In [1]:
import os
import urllib.parse
import warnings

import pandas as pd
import requests

In [2]:
# API key for urlscan.io, read from the environment so it is never hardcoded
# in the notebook. os.getenv returns None when the variable is not set.
URLSCAN_API_KEY = os.getenv('URLSCAN_API_KEY')

# Headers passed to every requests call in this notebook. The 'api-key' entry
# is only present when a key is actually configured, so the dict reflects
# exactly what will be sent (requests omits None-valued headers anyway).
request_headers = {'api-key': URLSCAN_API_KEY} if URLSCAN_API_KEY else {}

if not URLSCAN_API_KEY:
    # Best-effort: do not abort the notebook, but make the missing key visible
    # instead of silently sending requests without it.
    warnings.warn(
        "URLSCAN_API_KEY is not set; urlscan.io requests will be sent "
        "without an API key."
    )

In [3]:
# Start `output` as an empty DataFrame (no rows, no columns) so the search
# cell below has a frame to concatenate its results onto.
output: pd.DataFrame = pd.DataFrame()

In [4]:
# Load the search queries, one per line, stripping surrounding whitespace.
# Blank lines (including a trailing newline at the end of the file) are
# skipped so that no empty query string ends up in `querylist`.
with open("/path/to/query/urlscanquery.txt", "r") as query_file:
    querylist = [
        stripped
        for stripped in (raw_line.strip() for raw_line in query_file)
        if stripped
    ]

In [5]:
# Seconds to wait for urlscan.io before giving up on a single query; without
# a timeout a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _flatten_result(result):
    """Flatten one search hit (a mapping) into a single-row DataFrame.

    Every top-level key becomes a column prefix:
      * dict value   -> one column per sub-key, named ``<key>_<subkey>``
      * list value   -> one column per position, named ``<key>_<position>``
      * scalar value -> a single column named ``<key>_0``

    Returns an empty DataFrame when ``result`` has no keys.
    """
    parts = [
        pd.DataFrame([value]).add_prefix(f"{key}_")
        for key, value in result.items()
    ]
    # pd.concat raises on an empty list, so handle a key-less hit explicitly.
    return pd.concat(parts, axis=1) if parts else pd.DataFrame()


# Run every query against the urlscan.io search API and collect one flattened
# single-row frame per hit. Frames are gathered in a list and concatenated
# once at the end instead of growing `output` row by row.
result_frames = []
for q in querylist:
    try:
        # Percent-encode the whole query (safe='' also encodes '/' and ':').
        query = urllib.parse.quote(q, safe='')

        url = "https://www.urlscan.io/api/v1/search?q=" + query
        r = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT_SECONDS)
        # Surface HTTP errors directly rather than as a KeyError on "results".
        r.raise_for_status()
        data = r.json()

        results = data["results"]

        for hit in results:
            result_frames.append(_flatten_result(hit))

    except Exception as e:
        # Best-effort: report which query failed and continue with the rest.
        print(f"Query {q!r} failed: {type(e).__name__}: {e}")

# `output` is rebuilt from scratch so re-running this cell does not duplicate
# rows. The list is reversed to keep newest-hit-first row order (and the column
# order that follows from it). Every per-hit frame carries the single index
# label 0 and it is kept, so all rows of `output` share index 0.
output = pd.concat(result_frames[::-1]) if result_frames else pd.DataFrame()


In [6]:
# Preview the first five collected rows.
output.head(n=5)

Unnamed: 0,verdicts_score,verdicts_malicious,verdicts_hasVerdicts,submitter_country,dom_size,dom_hash,frames_length,task_visibility,task_method,task_domain,...,result_0,screenshot_0,task_tags,page_tlsValidDays,page_tlsAgeDays,page_tlsValidFrom,page_tlsIssuer,verdicts_categories,brand_0,task_source
0,0,False,False,gb,17408,700eec05183ba56fb53b1c135efc0b1b10b3b3536247b5...,2,public,manual,blackhiawk-okta.com,...,https://urlscan.io/api/v1/result/63888005-49af...,https://urlscan.io/screenshots/63888005-49af-4...,,,,,,,,
0,0,False,False,gb,17408,700eec05183ba56fb53b1c135efc0b1b10b3b3536247b5...,2,public,manual,blackhiawk-okta.com,...,https://urlscan.io/api/v1/result/e60a95fd-a3f5...,https://urlscan.io/screenshots/e60a95fd-a3f5-4...,,,,,,,,
0,0,False,False,us,17408,700eec05183ba56fb53b1c135efc0b1b10b3b3536247b5...,2,unlisted,api,blackhiawk-okta.com,...,https://urlscan.io/api/v1/result/ea0271f7-0444...,https://urlscan.io/screenshots/ea0271f7-0444-4...,,,,,,,,
0,0,False,False,us,17408,700eec05183ba56fb53b1c135efc0b1b10b3b3536247b5...,2,public,api,blackhiawk-okta.com,...,https://urlscan.io/api/v1/result/c804ebc0-bf91...,https://urlscan.io/screenshots/c804ebc0-bf91-4...,[phishing],90.0,0.0,2023-02-22T00:00:00.000Z,"cPanel, Inc. Certification Authority",,,
0,0,False,False,us,17408,700eec05183ba56fb53b1c135efc0b1b10b3b3536247b5...,2,unlisted,api,blackhiawk-okta.com,...,https://urlscan.io/api/v1/result/75b1dd10-f54e...,https://urlscan.io/screenshots/75b1dd10-f54e-4...,,,,,,,,


In [11]:
# List every column name present in `output` (shown as the cell's result).
output.columns

Index(['verdicts_score', 'verdicts_malicious', 'verdicts_hasVerdicts',
       'submitter_country', 'dom_size', 'dom_hash', 'frames_length',
       'task_visibility', 'task_method', 'task_domain', 'task_apexDomain',
       'task_time', 'task_uuid', 'task_url', 'stats_uniqIPs',
       'stats_uniqCountries', 'stats_dataLength', 'stats_encodedDataLength',
       'stats_requests', 'scanner_country', 'links_length', 'page_country',
       'page_server', 'page_redirected', 'page_ip', 'page_mimeType',
       'page_title', 'page_url', 'page_ptr', 'page_domain', 'page_apexDomain',
       'page_asnname', 'page_asn', 'page_status', 'text_size', 'text_hash',
       '_id_0', 'sort_0', 'sort_1', 'result_0', 'screenshot_0', 'task_tags',
       'page_tlsValidDays', 'page_tlsAgeDays', 'page_tlsValidFrom',
       'page_tlsIssuer', 'verdicts_categories', 'brand_0', 'task_source'],
      dtype='object')

In [7]:
# Distinct values of the `task_domain` column, first occurrence kept.
output["task_domain"].drop_duplicates()

0              blackhiawk-okta.com
0         mail.blackhiawk-okta.com
0             workspace-nikeus.com
0    qa-fundcentral.gcmlp-test.net
0                   159.89.144.101
0                   193.149.176.15
0                   68.183.160.167
0                      cgi-sso.com
0                     64.52.80.174
0                   64.190.113.139
0                 xub07-fdexwgl.us
0        cms-dashboard.alorica.com
0                    162.33.177.31
0                   64.190.113.120
0                      64.52.80.59
0                     ctl-help.com
0                   193.149.176.32
0                     45.61.136.40
0                   sprint-sso.net
0              simpleidentity.help
0                  sprint-corp.net
0                  157.230.128.156
0                    178.128.64.18
0                    162.33.179.55
Name: task_domain, dtype: object

In [12]:
# Distinct values of the `page_domain` column, first occurrence kept.
output["page_domain"].drop_duplicates()

0          blackhiawk-okta.com
0     mail.blackhiawk-okta.com
0         workspace-nikeus.com
0     qa-secure.gcmlp-test.net
0               159.89.144.101
0               193.149.176.15
0               68.183.160.167
0                  cgi-sso.com
0                 64.52.80.174
0               64.190.113.139
0             xub07-fdexwgl.us
0    cms-dashboard.alorica.com
0                162.33.177.31
0               64.190.113.120
0                  64.52.80.59
0                 ctl-help.com
0               193.149.176.32
0                 45.61.136.40
0               sprint-sso.net
0          simpleidentity.help
0              sprint-corp.net
0              157.230.128.156
0                178.128.64.18
0                162.33.179.55
Name: page_domain, dtype: object

In [9]:
# Write the collected results (including the index column) to a CSV file in
# the current working directory.
output.to_csv(path_or_buf='urlscan_output.csv')