# Scraping: Build a URL map

### Source

- Medium: [How to build a URL crawler to map a website using Python](https://medium.com/swlh/how-to-build-a-url-crawler-to-map-a-website-using-python-3e7db83feb7a).

In [1]:
def add_local_address(url):
    """Add ``url`` to the ``local_urls`` set unless it looks like a non-page link.

    A url is ignored when its text contains ``#``, ``.pdf`` or ``mailto``
    anywhere; any other url is added to the module-level ``local_urls`` set.
    Nothing is returned.
    """
    skip_markers = ('#', '.pdf', 'mailto')
    if any(marker in url for marker in skip_markers):
        return
    local_urls.add(url)

In [7]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from urllib.parse import urlparse
from collections import deque
import re

#url = "https://jmquintana79.github.io"
#url = "https://www.jma.go.jp/jma/indexe.html"
#url = "https://www.endesa.com"
#url = "https://n-kishou.com/corp/"
#url = "https://www.ree.es/es"
#url = "https://www.meteologica.com"
#url = "https://www.thomsonreuters.com/en/products-services/energy/top-100.html"
# seed url where the crawl starts (previously tried alternatives are kept above)
url = "https://globaledge.msu.edu/industries/energy/corporations"

# number of successfully fetched pages after which the crawl stops
MAX_PAGES = 1
# seconds to wait for a server; without a timeout one unresponsive host
# would block the crawl forever
REQUEST_TIMEOUT = 10

# a queue of urls to be crawled
new_urls = deque([url])

# a set of urls that have already been processed
processed_urls = set()
# a set of urls inside the target website
local_urls = set()
# a set of urls outside the target website
foreign_urls = set()
# a set of broken urls
broken_urls = set()

# initialize counter of successfully fetched pages
icount = 0
# process urls one by one until we exhaust the queue
while new_urls:
    # move next url from the queue to the set of processed urls
    url = new_urls.popleft()
    processed_urls.add(url)
    # get url's content
    print("Processing %s" %(url))
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except (requests.exceptions.MissingSchema,
            requests.exceptions.ConnectionError,
            requests.exceptions.InvalidURL,
            requests.exceptions.InvalidSchema,
            requests.exceptions.Timeout):
        # add broken urls to their own set, then continue
        # (broken urls do not count towards MAX_PAGES)
        broken_urls.add(url)
        continue

    # extract base url to resolve relative links
    parts = urlsplit(url)
    base = "{0.netloc}".format(parts)
    strip_base = base.replace("www.", "")
    # named site_root (not base_url) so it does not clobber the base_url()
    # helper defined further down in this file
    site_root = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/')+1] if '/' in parts.path else url

    # create a beautiful soup for the html document
    soup = BeautifulSoup(response.text, "lxml")

    for link in soup.find_all('a'):
        # extract link url from the anchor; anchors without an href
        # (e.g. named anchors) are not links, so skip them
        anchor = link.attrs.get("href")
        if not anchor:
            continue

        if anchor.startswith('/'):
            # root-relative link
            add_local_address(site_root + anchor)
        elif strip_base in anchor:
            # absolute link that mentions the site's own host
            add_local_address(anchor)
        elif not anchor.startswith('http'):
            # link relative to the current page's directory
            add_local_address(path + anchor)
        else:
            foreign_urls.add(anchor)

    # enqueue the local urls not seen yet, once per page; snapshotting the
    # queue as a set keeps every membership test O(1), and sorting makes
    # the crawl order deterministic
    queued = set(new_urls)
    for candidate in sorted(local_urls - processed_urls - queued):
        new_urls.append(candidate)

    # add counter
    icount += 1
    # stop once the page budget is used up
    if icount >= MAX_PAGES:
        break


Processing https://globaledge.msu.edu/industries/energy/corporations


In [8]:
local_urls

{'https://globaledge.msu.edu/',
 'https://globaledge.msu.edu/academy',
 'https://globaledge.msu.edu/academy/announcements',
 'https://globaledge.msu.edu/academy/community-colleges',
 'https://globaledge.msu.edu/academy/course-content',
 'https://globaledge.msu.edu/academy/studyibfaq',
 'https://globaledge.msu.edu/account/register',
 'https://globaledge.msu.edu/blog',
 'https://globaledge.msu.edu/comparator',
 'https://globaledge.msu.edu/countries/argentina/corporations',
 'https://globaledge.msu.edu/countries/australia/corporations',
 'https://globaledge.msu.edu/countries/austria/corporations',
 'https://globaledge.msu.edu/countries/brazil/corporations',
 'https://globaledge.msu.edu/countries/canada/corporations',
 'https://globaledge.msu.edu/countries/china/corporations',
 'https://globaledge.msu.edu/countries/colombia/corporations',
 'https://globaledge.msu.edu/countries/cyprus/corporations',
 'https://globaledge.msu.edu/countries/czechia/corporations',
 'https://globaledge.msu.edu/c

In [9]:
foreign_urls

{'http://ceh.cosmo-oil.co.jp/',
 'http://en.ceec.net.cn/',
 'http://en.cgnpc.com.cn/',
 'http://eng.kogas-tech.co.kr/main.do',
 'http://eng.skinnovation.com/',
 'http://france.edf.com/',
 'http://hollyfrontier.com/',
 'http://interrao.ru/en/',
 'http://ir.ennenergy.com/en/global/home.php',
 'http://msu.edu',
 'http://novatek.ru/en/',
 'http://www.600795.com.cn/',
 'http://www.aep.com/',
 'http://www.aes.com/',
 'http://www.agl.com.au/',
 'http://www.alliantenergy.com',
 'http://www.ameren.com/',
 'http://www.atmosenergy.com/',
 'http://www.bharatpetroleum.com/',
 'http://www.bp.com/',
 'http://www.caltex.com.au/',
 'http://www.canadianutilities.com/',
 'http://www.caosco.com',
 'http://www.cenovus.com/',
 'http://www.centerpointenergy.com/',
 'http://www.centrica.com/',
 'http://www.cez.cz',
 'http://www.cheniere.com',
 'http://www.chevron.com/',
 'http://www.chinagasholdings.com.hk/',
 'http://www.chuden.co.jp/english/index.html',
 'http://www.clypg.com.cn/en/',
 'http://www.cmsenergy

### Get the base URL of each foreign URL

In [10]:
import urllib.parse
def base_url(url, with_path=False):
    """Return ``url`` stripped of its params, query and fragment.

    With ``with_path=False`` (the default) the path is dropped as well, so
    only ``scheme://netloc`` remains. With ``with_path=True`` the path is
    kept up to, but not including, its last ``/`` (i.e. the final path
    segment is removed).
    """
    pieces = urllib.parse.urlparse(url)
    # everything before the last '/' of the path; '' when there is no '/'
    directory = pieces.path.rpartition('/')[0] if with_path else ''
    trimmed = pieces._replace(path=directory, params='', query='', fragment='')
    return trimmed.geturl()

In [11]:
# reduce every foreign url to its scheme://host base and drop duplicates
list({base_url(foreign) for foreign in foreign_urls})

['http://www.hess.com',
 'http://www.chinagasholdings.com.hk',
 'http://www.repsol.com',
 'https://www.ovintiv.com',
 'http://www.dcc.ie',
 'http://www.verbund.com',
 'http://www.delek-group.com',
 'http://www.forbes.com',
 'http://www.ree.es',
 'https://www.firstenergycorp.com',
 'https://www.towngas.com',
 'http://www.kepco.co.kr',
 'http://www.gdi.com.hk',
 'http://www.edison.com',
 'http://interrao.ru',
 'http://www.alliantenergy.com',
 'http://www.sk.com',
 'http://www.cypc.com.cn',
 'http://www.surgutneftegas.ru',
 'http://www.eletrobras.com',
 'https://twitter.com',
 'http://www.atmosenergy.com',
 'http://www.chuden.co.jp',
 'http://www.jpower.co.jp',
 'https://www.lundin-energy.com',
 'http://www.marathonpetroleum.com',
 'http://www1.kepco.co.jp',
 'http://www.exxonmobil.com',
 'https://www.neste.us',
 'https://www.diamondbackenergy.com',
 'http://www.cheniere.com',
 'http://www.ameren.com',
 'http://www.powerassets.com',
 'http://www.tepco.co.jp',
 'http://www.ril.com',
 'http