# Scrapping: Build urls map

### Source

- Medium: [How to build a URL crawler to map a website using Python](https://medium.com/swlh/how-to-build-a-url-crawler-to-map-a-website-using-python-3e7db83feb7a).

In [26]:
def add_local_address(url):
    if '#' in url:
        pass
    elif '.pdf' in url:
        pass
    else:
        local_urls.add(url) 

In [33]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from urllib.parse import urlparse
from collections import deque
import re

#url = "https://jmquintana79.github.io"
#url = "https://www.jma.go.jp/jma/indexe.html"
url = "https://www.endesa.com"
# a queue of urls to be crawled
new_urls = deque([url])

# a set of urls that we have already been processed 
processed_urls = set()
# a set of domains inside the target website
local_urls = set()
# a set of domains outside the target website
foreign_urls = set()
# a set of broken urls
broken_urls = set()

# initialize counter
icount = 0
# process urls one by one until we exhaust the queue
while len(new_urls):
    # move next url from the queue to the set of processed urls
    url = new_urls.popleft()
    processed_urls.add(url)
    # get url's content
    print("Processing %s" %(url))
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
        # add broken urls to it's own set, then continue
        broken_urls.add(url)
        continue
    
    # extract base url to resolve relative links
    parts = urlsplit(url)
    base = "{0.netloc}".format(parts)
    strip_base = base.replace("www.", "")
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/')+1] if '/' in parts.path else url

    # create a beutiful soup for the html document
    soup = BeautifulSoup(response.text, "lxml")

    for link in soup.find_all('a'):
        # extract link url from the anchor
        anchor = link.attrs["href"] if "href" in link.attrs else ''

        if anchor.startswith('/'):
            local_link = base_url + anchor
            #local_urls.add(local_link)
            add_local_address(local_link)
        elif strip_base in anchor:
            #local_urls.add(anchor)
            add_local_address(anchor)
        elif not anchor.startswith('http'):
            local_link = path + anchor
            #local_urls.add(local_link)
            add_local_address(local_link)
        else:
            foreign_urls.add(anchor)

        # only local
        for i in local_urls:
            if not i in new_urls and not i in processed_urls:
                new_urls.append(i)
    # add counter
    icount += 1
    # stop
    if icount == 200:
        break


Processing https://www.endesa.com
Processing https://www.endesa.com/es
Processing https://www.endesa.com/es/prensa/sala-de-prensa/noticias/clientes/plan-especial-atencion-medidas-urgentes-La-Palma
Processing https://www.endesa.com/es/conoce-la-energia/energia-y-mas/tarifas-acceso-gas-natural
Processing https://www.endesa.com/es/clientes-endesa/clientes-hogares
Processing https://www.endesa.com/es/te-ayudamos/oficinas-y-puntos-de-servicio/servicio-videoatencion
Processing https://www.endesa.com/es/clientes-endesa
Processing https://www.endesa.com/es/te-ayudamos
Processing https://www.endesa.com/es/te-ayudamos/oficinas-y-puntos-de-servicio
Processing https://www.endesa.com/es/blog/blog-de-endesa
Processing https://www.endesa.com/es/blog/blog-de-endesa/sostenibilidad/energia-solar
Processing https://www.endesa.com/es/blog/blog-de-endesa/luz/alquiler-factura-luz-sin-pagar
Processing https://www.endesa.com/es/blog/blog-de-endesa/climatizacion/bono-social-termico
Processing https://www.endes

Processing https://www.endesa.com/en/endesa-customers
Processing https://www.endesa.com/ca/t-ajudem
Processing https://www.endesa.com/en/advice
Processing https://www.endesa.com/es/te-ayudamos/
Processing https://www.endesa.com/es/luz-y-gas/luz/one/tarifa-one-luz?int=movedt:oneluz:rit:es/te-ayudamos/oficinas-y-puntos-de-servicio
Processing https://www.endesa.com/es/luz-y-gas/gas/tarifa-one-gas?int=movedt:onegas:rit:es/te-ayudamos/oficinas-y-puntos-de-servicio
Processing https://www.endesa.com/ca/t-ajudem/oficines-i-punts-de-servei
Processing https://www.endesa.com/en/advice/offices-and-service-points
Processing https://www.endesa.com/es/blog/
Processing https://www.endesa.com/es/blog
Processing https://www.endesa.com/es/blog/?page=1&sort=newest
Processing https://www.endesa.com/es/blog/?page=1&sort=oldest
Processing https://www.endesa.com/ca/blogs/blog-d-endesa
Processing https://www.endesa.com/en/blogs/endesa-s-blog
Processing https://www.endesa.com/es/blog/blog-de-endesa/sostenibilid

In [34]:
foreign_urls

{'http://dam-echannel.enel.com/podcasthandler/116d227b-1335-4ab5-a2eb-19d4eecf78d5/audio.mp3',
 'http://dam-echannel.enel.com/podcasthandler/4591214a-2bf1-454e-8975-cfa00159385c/audio.mp3',
 'http://dam-echannel.enel.com/podcasthandler/7874a3da-b081-4b4f-bf28-0e11bad79677/audio.mp3',
 'http://streamstudio.world-television.com/CCUIv3/registration.aspx?ticket=184-185-22610&target=en-default-&status=ondemand&browser=ns-0-1-0-0-0',
 'http://support.apple.com/kb/ht1677?viewlocale=es_es',
 'http://windows.microsoft.com/es-es/windows-vista/block-or-allow-cookies',
 'http://www.boe.es/boe/dias/2017/10/07/pdfs/BOE-A-2017-11505.pdf',
 'http://www.endesa-gnv-bonneuil.fr',
 'http://www.esios.ree.es/',
 'http://www.involcan.org/pevolca/',
 'http://www.linkedin.com/shareArticle?mini=true&url=URLREPLACE',
 'http://www.man.es/man/coleccion/nuevas-adquisiciones/reloj-sol-duque-alba.html',
 'https://accounts.enel.com/samlsso?slo=true&spEntityID=ENDESA_WEB_ES&returnTo=https://www.endesaclientes.com/logou

In [35]:
local_urls

{'http://www.endesa.com/',
 'https://ahorraconendesa.com/',
 'https://ahorroconendesa.com',
 'https://ahorrogarantizadoendesa.com',
 'https://endesa.com',
 'https://sge-endesa.com',
 'https://tarifasendesa.com',
 'https://vuelaconendesa.com',
 'https://vuelaconendesa.com/',
 'https://vuelaconendesa.com/?int=conof:avs:owm:eees:gdc:bnf:end::fide::land:clientes-endesa:ctp:vrs:dual:vldend:pos:imt::::mtd::::__;!!OjemSMKBgg!yYqQYIjSiUg6sV7pvlIcCg4yTcDMtaVHT6rzx-KCeXeX3kRYlXwpx0h3t6YAR22oCA$',
 'https://vuelaconendesa.com/iberia',
 'https://vuelaconendesa.com/vueling',
 'https://www.endesa.com',
 'https://www.endesa.com/',
 'https://www.endesa.com/ca',
 'https://www.endesa.com/ca/',
 'https://www.endesa.com/ca/accessibilitat',
 'https://www.endesa.com/ca/altres-productes/aire-condicionat',
 'https://www.endesa.com/ca/altres-productes/calefaccio',
 'https://www.endesa.com/ca/altres-productes/infoenergia-cat',
 'https://www.endesa.com/ca/altres-productes/manteniment',
 'https://www.endesa.com/c