Skip to content

Commit

Permalink
Make passive spider more intelligent
Browse files Browse the repository at this point in the history
  • Loading branch information
kpcyrd committed Nov 13, 2018
1 parent 241b52d commit b9aa341
Showing 1 changed file with 10 additions and 19 deletions.
29 changes: 10 additions & 19 deletions modules/dev/passive-spider.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
-- Source: urls
-- License: GPL-3.0

function entry(target, parent, href)
function entry(parent, href)
-- TODO: parse mailto:foo@example.com?subject=asdf
-- TODO: parse tel:+4912345
-- TODO: allow discovering 3rd-party domains
Expand All @@ -16,6 +16,7 @@ function entry(target, parent, href)
end

url = url_join(parent, href)
if last_err() then return clear_err() end
if url:match('^https?://') == nil then
return
end
Expand All @@ -25,19 +26,14 @@ function entry(target, parent, href)
host = parts['host']
psl = psl_domain_from_dns_name(host)

if psl ~= target then
-- TODO: this doesn't match the current target, but might match a different target in scope
-- if we can check an entry exists in the db we could make this more intelligent
return
end

domain_id = db_add('domain', {
value=psl,
})
db_add('subdomain', {
domain_id=domain_id,
value=host,
})
domain_id = db_select('domain', psl)
if domain_id ~= nil then
db_add('subdomain', {
domain_id=domain_id,
value=host,
})
end
end

function run(arg)
Expand All @@ -55,17 +51,12 @@ function run(arg)
return
end

-- get public suffix
url = url_parse(arg['value'])
if last_err() then return end
psl = psl_domain_from_dns_name(url['host'])

-- process html links
i = 1
while i <= #links do
href = links[i]['attrs']['href']

entry(psl, arg['value'], href)
entry(arg['value'], href)

i = i+1
end
Expand Down

0 comments on commit b9aa341

Please sign in to comment.