# Scraping the Inter-Municipal Distances from the Web

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings()

import time
import random

import networkx as nx
import pickle

### Getting a list of all the nodes in the network

In [4]:
G = nx.read_gpickle('networks/OD.gpickle')

nodes = list(G.nodes())
nodes[:5]

['Aabenraa', 'Aalborg', 'Aarhus', 'Albertslund', 'Allerød']

A lot of the municipality names contain Danish special characters, so we replace these to match the way they are spelled in the urls we are gonna crawl through.

In [6]:
web_friendly = {}
for muni in nodes:
    friendly = muni.lower().replace('æ','ae').replace('ø','o').replace('å','aa')
    web_friendly[muni] = friendly
    
web_friendly['Nordfyns'] = 'nordfyn'
web_friendly['Tårnby'] = 'tarnby'
web_friendly['Vesthimmerlands'] = 'vesthimmerland'

error = {'Nordfyns', 'Tårnby', 'Vesthimmerlands'}

Making a set of disconnected nodes, that is, with a degree of 0, to add edges to the network later.

In [9]:
missing = {'Billund',
           'Bornholm',
           'Dragør',
           'Fanø',
           'Gribskov',
           'Halsnæs',
           'Jammerbugt',
           'Langeland',
           'Lemvig', 
           'Læsø', 
           'Morsø', 
           'Norddjurs', 
           'Nordfyns', 
           'Odder', 
           'Samsø', 
           'Stevns', 
           'Syddjurs', 
           'Vesthimmerlands',
           'Ærø'}

Making a list of tuples representing all the edges I need to scrape distances for.

In [11]:
def make_edges():
    new_edges = []
    for origin in missing:
        for destination in G.nodes:
            e = (origin, destination)
            new_edges.append(e)
    return new_edges

edges = [edge for edge in list(G.edges)]
new_edges = make_edges()
print(new_edges[:4])

all_edges = edges + new_edges

[('Dragør', 'Aabenraa'), ('Dragør', 'Aalborg'), ('Dragør', 'Aarhus'), ('Dragør', 'Albertslund')]


# Making the scraper

We scrape the distances from the website https://www.afstande.com/ as part of a string.  

The string contains the distance both in kilometers and travel time.

In [12]:
def scraper(muni1, muni2):
    URL = f"https://www.afstande.com/afstand/{muni1}/{muni2}/"

    # Make a GET request to fetch the raw HTML content
    html_content = requests.get(URL, verify = False).text
    
    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
    
    # extract the string we need
    text = soup.find_all('p')[0]
    
    return text

In [33]:
def get_data(list_of_edges):
    outfile = open('data/distancedata.csv', 'a', encoding='UTF-8')
    travel_data = {}
    n = 0
    
    for edge in list_of_edges:
        n += 1
        muni1 = edge[0]
        muni2 = edge[1]
        if (muni1, muni2) not in travel_data and (muni2, muni1) not in travel_data:
            _muni1 = web_friendly[muni1]
            _muni2 = web_friendly[muni2]
            text = str(scraper(_muni1, _muni2))
            travel_data[(muni1, muni2)] = text
            time.sleep(random.randint(1,4)) # adding a timeout so as not to overload their servers
        if n % 10 == 0:
            progress = round((n / (len(edges))) * 100, 2)
            print('Progress: ' + str(progress) + ' %')
        line = str(edge)+';'+ text + '\n'
        outfile.write(line)
    outfile.close()
    return travel_data

In [48]:
get_data(all_edges)

Progress: 0.36 %
Progress: 0.73 %
Progress: 1.09 %
Progress: 1.46 %
Progress: 1.82 %
Progress: 2.18 %
Progress: 2.55 %
Progress: 2.91 %
Progress: 3.28 %
Progress: 3.64 %
Progress: 4.0 %
Progress: 4.37 %
Progress: 4.73 %
Progress: 5.09 %
Progress: 5.46 %
Progress: 5.82 %
Progress: 6.19 %
Progress: 6.55 %
Progress: 6.91 %
Progress: 7.28 %
Progress: 7.64 %
Progress: 8.01 %
Progress: 8.37 %
Progress: 8.73 %
Progress: 9.1 %


{('Aabenraa',
  'Tårnby'): '<p>Afstanden mellem Aabenraa og Tårnby er 212 kilometer og køreafstand er 285 kilometer. Den forventede køretid fra Aabenraa til Tårnby er 2 timer 50 minutter.</p>',
 ('Aalborg',
  'Tårnby'): '<p>Afstanden mellem Aalborg og Tårnby er 228 kilometer og køreafstand er 307 kilometer. Den forventede køretid fra Aalborg til Tårnby er 3 timer 44 minutter.</p>',
 ('Aarhus',
  'Tårnby'): '<p>Afstanden mellem Aarhus og Tårnby er 160 kilometer og køreafstand er 190 kilometer. Den forventede køretid fra Aarhus til Tårnby er 2 timer 35 minutter.</p>',
 ('Assens',
  'Tårnby'): '<p>Afstanden mellem Assens og Tårnby er 175 kilometer og køreafstand er 196 kilometer. Den forventede køretid fra Assens til Tårnby er 2 timer 9 minutter.</p>',
 ('Brønderslev',
  'Tårnby'): '<p>Afstanden mellem Brønderslev og Tårnby er 245 kilometer og køreafstand er 336 kilometer. Den forventede køretid fra Brønderslev til Tårnby er 4 timer 1 minutter.</p>',
 ('København',
  'Tårnby'): '<p>Afstan