### Web scraping to collect a list of benign URLs

This notebook contains the code used to collect 25,000+ URLs by crawling a list of Alexa top websites. The URLs were subsequently submitted to virustotal.com to verify that they have a clean reputation.

In [None]:
# read in list of domains to crawl

import pandas as pd

# Load the seed file and keep only its `domain` column, converted to a list.
top_2500 = pd.read_csv('top_2500.csv').domain.to_list()

In [None]:
# get a list of proxies to use

from lxml.html import fromstring
import requests
from itertools import cycle
import traceback

def get_proxies(limit=10, timeout=10):
    """
    Scrapes proxy addresses from the table on free-proxy-list.net.

    Only the first `limit` table rows are inspected, and a row is kept only
    when its 7th cell contains the text "yes" (presumably the site's HTTPS
    column -- confirm against the current page layout).

    params:
        limit (int | None): number of leading table rows to inspect,
            default is 10. None inspects every row.
        timeout (float): seconds to wait for the site to respond, default is 10.
    returns:
        set of "ip:port" strings; empty when no row qualifies.
    raises:
        requests.exceptions.RequestException: the page could not be fetched
            (including when `timeout` expires).
    """
    url = 'https://free-proxy-list.net/'
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:limit]:
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        ip = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        if not ip or not port:
            # Row without text in the address cells: indexing [0] would raise
            # IndexError, so skip just this row.
            continue
        proxies.add(":".join([ip[0], port[0]]))
    return proxies

# Scrape a fresh set of proxy addresses (this performs a network request).
proxies = get_proxies()

# Display the result so the addresses can be copied into the hard-coded
# `proxies` list inside get_all_website_links; that function builds its own
# local list and does not read this variable.
proxies  # to use with get_all_website_links function

In [None]:
import requests
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
import random

# set up colorama before any coloured output is printed
colorama.init()

# colour codes used when printing discovered links
GREEN, GRAY, RESET = (
    colorama.Fore.GREEN,
    colorama.Fore.LIGHTBLACK_EX,
    colorama.Fore.RESET,
)

# module-level crawl state: the unique links found so far, kept in two sets
# (same-site links and off-site links)
internal_urls, external_urls = set(), set()

# running count of pages visited by crawl()
total_urls_visited = 0

def is_valid(url):
    """
    Tells whether `url` parses with both a scheme and a network location.
    """
    parts = urlparse(url)
    if not parts.netloc:
        # no host component, e.g. a relative path or a `mailto:` link
        return False
    return bool(parts.scheme)

def get_all_website_links(url, timeout=10):
    """
    Returns all URLs found on `url` that belong to the same website.

    The page is fetched through a proxy picked at random from a hard-coded
    list. Every newly seen same-site link is added both to the returned set
    and to the global `internal_urls`; every newly seen off-site link is
    added to the global `external_urls`.

    params:
        url (str): absolute URL of the page to fetch.
        timeout (float): seconds to wait for the proxy/server, default is 10.
    returns:
        set of same-site URLs that were not already in `internal_urls`, or
        None when the page could not be fetched or processed.
    """
    # all URLs of `url`
    urls = set()
    # host of the page being crawled (the URL without protocol and path)
    domain_name = urlparse(url).netloc.lower()

    proxies = [
        '103.102.14.128:8080',
        '118.172.51.110:36552',
        '118.174.220.14:43473',
        '159.65.135.75:8080',
        '163.172.226.142:3838',
        '177.99.206.82:8080',
        '182.160.117.130:53281',
        '200.108.183.2:8080',
        '212.233.109.70:3128',
        '86.125.112.230:57373',
    ]  #######  copy in list of proxies

    proxy = random.choice(proxies)

    try:
        # The timeout keeps one dead free proxy from stalling the whole crawl.
        response = requests.get(
            url,
            proxies={"http": proxy, "https": proxy},
            timeout=timeout,
        )
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.find_all("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            try:
                # join the URL if it's relative (not absolute link)
                parsed_href = urlparse(urljoin(url, href))
            except ValueError:
                # malformed href (e.g. a broken IPv6 literal): drop this link only
                continue
            # Keep every URL component. geturl() re-inserts the ';', '?' and '#'
            # separators, and does not force '://' onto schemes such as
            # `mailto:`, so those are rejected by is_valid below.
            href = parsed_href.geturl()
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            # Compare hosts rather than searching the whole URL string, so a
            # domain that merely appears in another site's path or query is
            # not mistaken for an internal link. Subdomains count as internal.
            link_host = parsed_href.netloc.lower()
            is_internal = (
                link_host == domain_name
                or link_host.endswith("." + domain_name)
            )
            if not is_internal:
                # external link
                if href not in external_urls:
                    print(f"{GRAY}[!] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except Exception:
        # Most free proxies will often get connection errors.
        # We will skip retries.
        # `Exception` rather than a bare except, so a keyboard/kernel interrupt
        # still stops the crawl instead of being swallowed here.
        print("Skipping. Connnection error")
        return None

def crawl(url, max_urls=30):
    """
    Visits `url`, prints the links returned for it, then recursively visits
    each of those links.
    Collected links end up in the `external_urls` and `internal_urls` global sets.
    params:
        max_urls (int): links stop being followed once the global
            `total_urls_visited` counter exceeds this value, default is 30.
    """
    global total_urls_visited
    total_urls_visited = total_urls_visited + 1
    found = get_all_website_links(url)
    print(found)
    if found is None:
        # the page could not be fetched, so there is nothing to follow
        return
    for next_url in found:
        if total_urls_visited > max_urls:
            break
        crawl(next_url, max_urls=max_urls)


In [None]:
# loop through the domain list and crawl sites

for site in top_2500:
    # crawl() compares the module-level `total_urls_visited` with max_urls and
    # never resets it. Without this reset the first site uses up the budget
    # and every later site only has its front page fetched.
    total_urls_visited = 0
    crawl('http://' + site,  max_urls = 10)

In [None]:
# merge the internal and external links into one de-duplicated set
alexa_urls = internal_urls | external_urls

In [None]:
# number of unique URLs collected
len(alexa_urls)

In [None]:
# create a dataframe and create new features by parsing url components
# Sort first: iteration order of a set of strings differs between kernel
# sessions, which would make the row order (and the saved CSV) irreproducible.
df = pd.DataFrame(data = sorted(alexa_urls), columns = ['url'])
# Fill each component column from the parsed results. Unpacking zip(*...)
# instead raises ValueError when no URLs were collected.
parsed_urls = [urlparse(u) for u in df['url']]
for position, component in enumerate(['scheme', 'netloc', 'path', 'params', 'query', 'fragment']):
    df[component] = [parts[position] for parts in parsed_urls]

In [None]:
# save to a csv file
# (to_csv writes the row index by default, so the file gets an unnamed first column)
df.to_csv('alexa_urls.csv')