
Commit

enenumxela committed Aug 18, 2021
2 parents b9f7995 + 149e3a3 commit e2342fd
Showing 2 changed files with 45 additions and 19 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -21,10 +21,13 @@ A passive reconnaissance tool for known URLs discovery - it gathers a list of UR

## Features

-* Fetches known URLs from **[AlienVault's OTX](https://otx.alienvault.com/)**, **[Common Crawl](https://commoncrawl.org/)**, **[URLScan](https://urlscan.io/)**, **[Github](https://github.com)** and the **[Wayback Machine](https://archive.org/web/)**.
-* Fetches disallowed paths from `robots.txt` found on your target domain and snapshotted by the Wayback Machine.
-* Regex filter URLs.
-* Save output to file.
+* Collect known URLs:
+    * Fetches from **[AlienVault's OTX](https://otx.alienvault.com/)**, **[Common Crawl](https://commoncrawl.org/)**, **[URLScan](https://urlscan.io/)**, **[Github](https://github.com)** and the **[Wayback Machine](https://archive.org/web/)**.
+    * Fetches disallowed paths from `robots.txt` found on your target domain and snapshotted by the Wayback Machine.
+* Reduce noise:
+    * Regex filter URLs.
+    * Removes duplicate pages, in the sense of URL patterns that are probably repetitive and point to the same web template.
+* Output to stdout for piping or save to file.

## Installation

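As a rough illustration of how the features above are consumed, here is a minimal, hypothetical sketch of driving the Runner API that this commit touches in pkg/sigurlfind3r/sigurlfind3r.go. It relies only on what the diff itself shows: New(options ...*Options), Run(ctx, domain) returning a channel of scraping.URL values with Source and Value fields, and an Options struct with an IncludeSubdomains field; the import path and any other Options fields (such as source API keys) are assumptions, not confirmed by this commit.

```go
package main

import (
	"context"
	"fmt"
	"log"

	// Assumed import path, inferred from the package layout in this repository.
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r"
)

func main() {
	// Sketch only: Options fields besides IncludeSubdomains (API keys,
	// regex filters, ...) are omitted and left at their zero values.
	runner := sigurlfind3r.New(&sigurlfind3r.Options{
		IncludeSubdomains: true,
	})

	URLs, err := runner.Run(context.Background(), "example.com")
	if err != nil {
		log.Fatal(err)
	}

	// Run streams results over a channel, so each URL can be printed
	// (or piped elsewhere) as soon as it is found.
	for URL := range URLs {
		fmt.Printf("[%s] %s\n", URL.Source, URL.Value)
	}
}
```

The channel-based Run is what makes the "output to stdout for piping" bullet cheap: nothing has to be buffered before printing.
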
53 changes: 38 additions & 15 deletions pkg/sigurlfind3r/sigurlfind3r.go
@@ -2,8 +2,11 @@ package sigurlfind3r

 import (
 	"context"
+	"fmt"
+	"net/url"
 	"regexp"
 
+	"github.com/enenumxela/urlx/pkg/urlx"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/passive"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/session"
@@ -59,35 +62,55 @@ func New(options ...*Options) (runner *Runner) {
 func (runner *Runner) Run(ctx context.Context, domain string) (URLs chan scraping.URL, err error) {
 	URLs = make(chan scraping.URL)
 
-	// Create a unique map for filtering duplicate URLs out
-	uniqueMap := make(map[string]scraping.URL)
-	// Create a map to track sources for each URL
-	sourceMap := make(map[string]map[string]struct{})
-
 	results := runner.Passive.Run(domain, runner.FilterRegex, runner.Options.IncludeSubdomains, runner.Options.Keys)
 
+	deDupMap := make(map[string]url.Values)
+	uniqueMap := make(map[string]scraping.URL)
+
 	// Process the results in a separate goroutine
 	go func() {
 		defer close(URLs)
 
 		for result := range results {
-			URL := result.Value
-
-			if _, exists := uniqueMap[URL]; !exists {
-				sourceMap[URL] = make(map[string]struct{})
+			// unique urls - If the url already exists in the unique map
+			if _, exists := uniqueMap[result.Value]; exists {
+				continue
 			}
 
-			sourceMap[URL][result.Source] = struct{}{}
-
-			if _, exists := uniqueMap[URL]; exists {
+			parsedURL, err := urlx.Parse(result.Value)
+			if err != nil {
 				continue
 			}
 
-			hostEntry := scraping.URL{Source: result.Source, Value: URL}
+			// urls with query
+			if len(parsedURL.Query()) > 0 {
+				unique := false
+
+				key := fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Domain, parsedURL.Path)
+
+				if _, exists := deDupMap[key]; exists {
+					for parameter := range parsedURL.Query() {
+						if _, exists := deDupMap[key][parameter]; !exists {
+							deDupMap[key][parameter] = []string{"sigurlfind3r"}
+							unique = true
+						}
+					}
+				} else {
+					deDupMap[key] = parsedURL.Query()
+					unique = true
+				}
+
+				if !unique {
+					continue
+				}
+			}
 
-			uniqueMap[URL] = hostEntry
+			uniqueMap[parsedURL.String()] = scraping.URL{
+				Source: result.Source,
+				Value: parsedURL.String(),
+			}
 
-			URLs <- hostEntry
+			URLs <- uniqueMap[parsedURL.String()]
 		}
 	}()
 

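The core of this change is deDupMap, keyed on scheme://host/path: a result whose query string differs only in parameter values is treated as another hit on the same page template and dropped, while a result that introduces a previously unseen parameter name for that key is kept. Below is a self-contained sketch of that idea written against the standard library's net/url; the committed code uses github.com/enenumxela/urlx (hence parsedURL.Domain) and stores url.Values in the map, but the intended filtering behaviour is the same.

```go
package main

import (
	"fmt"
	"net/url"
)

var (
	// seenParams maps "scheme://host/path" to the set of query-parameter
	// names already observed for that page template.
	seenParams = map[string]map[string]bool{}
	// seenURLs dedups exact URL strings, mirroring uniqueMap in the diff.
	seenURLs = map[string]bool{}
)

// keep reports whether a URL should be emitted: unseen as an exact string
// and, if it carries a query, contributing at least one new parameter name.
func keep(raw string) bool {
	if seenURLs[raw] {
		return false
	}

	u, err := url.Parse(raw)
	if err != nil {
		return false
	}

	if params := u.Query(); len(params) > 0 {
		key := fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, u.Path)

		if seenParams[key] == nil {
			seenParams[key] = map[string]bool{}
		}

		unique := false
		for name := range params {
			if !seenParams[key][name] {
				seenParams[key][name] = true
				unique = true
			}
		}
		if !unique {
			return false
		}
	}

	seenURLs[raw] = true
	return true
}

func main() {
	for _, raw := range []string{
		"https://example.com/post?id=1",
		"https://example.com/post?id=2",         // same parameter set: dropped
		"https://example.com/post?id=3&lang=en", // new parameter "lang": kept
		"https://example.com/about",             // no query: kept once
		"https://example.com/about",             // exact duplicate: dropped
	} {
		fmt.Println(keep(raw), raw)
	}
}
```

Running it prints true for the first, third and fourth inputs and false for the rest: /post?id=2 adds no new parameter name for its page key, so it is discarded as a repeat of the same template, which is the noise the README's dedup bullet describes.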