In [16]:
import os
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
### Set variables ###
# Folder that holds the input CSV. Set the DISTANCE_REQUEST_DIR environment variable
# to run the notebook on another machine; without it the original location is used.
local_folder = os.environ.get('DISTANCE_REQUEST_DIR',
                              'C:/Users/hcarl/source/repos/distance_request')
# File name of the input data set inside local_folder.
file_name = 'data_flyid_2019-04-08_clean.csv'

In [None]:
# Look up the route label and the distance for one start/destination pair on distance.to.
def get_route_and_distance(start, dest, timeout=30):
    """Scrape the route description and the distance between two places.

    Args:
        start: Name of the starting place; may be missing (None/NaN).
        dest: Name of the destination place; may be missing (None/NaN).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(route, distance)`` tuple holding the text of the page's
        ``main-route trip`` and ``value km`` elements. An item is None when its
        element is not present on the returned page, and both items are None
        when ``start`` or ``dest`` is missing.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # if we don't have a start or a destination, no need to look anything up!
    if pd.isna(start) or pd.isna(dest):
        return None, None

    # Percent-encode each place name so characters such as "/", "?" or "#"
    # inside a name cannot change the structure of the URL.
    my_url = "https://www.distance.to/{}/{}".format(
        quote(str(start), safe=""), quote(str(dest), safe=""))

    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(my_url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # soup.find() returns None when the element is absent; report that as a
    # missing value instead of failing with an AttributeError.
    route_tag = soup.find(class_="main-route trip")
    dist_tag = soup.find(class_="value km")
    route = route_tag.get_text() if route_tag is not None else None
    dist_calc = dist_tag.get_text() if dist_tag is not None else None
    return route, dist_calc

In [101]:
# Row-wise distance lookup, written to be used with DataFrame.apply(..., axis=1).
def get_distance(row, timeout=30):
    """Scrape the distance between a row's start and destination from distance.to.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'start'`` and ``'destination'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the page's ``value km`` element, or None when the start or
        the destination is missing or the element is not on the returned page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    start = row['start']
    dest = row['destination']

    print('Getting distance...')

    # if we don't have a start or a destination, no need to look anything up!
    # Return a single None (not a tuple) so every row yields one scalar value.
    if pd.isna(start) or pd.isna(dest):
        return None

    # Percent-encode each place name so characters such as "/", "?" or "#"
    # inside a name cannot change the structure of the URL.
    my_url = "https://www.distance.to/{}/{}".format(
        quote(str(start), safe=""), quote(str(dest), safe=""))

    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(my_url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # soup.find() returns None when the element is absent; report that as a
    # missing value instead of failing with an AttributeError.
    dist_tag = soup.find(class_="value km")
    if dist_tag is None:
        return None

    dist_calc = dist_tag.get_text()
    print('Got distance.')
    return dist_calc

In [105]:
# time a single lookup to see how long one request takes
%time get_route_and_distance('London', 'Paris')

Wall time: 317 ms


('London → Paris', '342.87')

One request takes around 300 ms, so 2,000 rows should take around 600 s (roughly 10 minutes) when the requests are made one after another. 
Most of that time is probably spent waiting for the response, so the requests could be parallelised if speed became important. 

In [52]:
# load input data set (csv): the first column becomes the row index, "." is read as a
# missing value, and malformed lines are skipped instead of raising
# NOTE(review): pandas deprecated error_bad_lines in 1.3 in favour of on_bad_lines='skip';
# switch the keyword when the notebook is moved to a newer pandas.
data_full = pd.read_csv(os.path.join(local_folder, file_name), encoding="ISO-8859-1",
                        sep=',', error_bad_lines=False, na_values=".", index_col=0)

In [106]:
# turn the column labels into a multi-column index, so we have multiple starts and multiple destinations

fields = ['destination', 'start']
indices = range(1, 47)
column_names = [(field, index) for field in fields for index in indices]

# Use `data_full`, the frame produced by the load cell; the name `data` that was
# referenced here before is not defined in any visible cell and fails on a fresh kernel.
data_new = data_full.sort_index(axis=1).copy()

# The relabelling below is purely positional, so make a column-count mismatch explicit.
# Assumes the sorted columns are all destination columns followed by all start
# columns -- TODO confirm against the CSV header.
assert data_new.shape[1] == len(column_names), (
    "expected %d columns, found %d" % (len(column_names), data_new.shape[1]))

data_new.columns = pd.MultiIndex.from_tuples(column_names)
data_new.head()

Unnamed: 0_level_0,destination,destination,destination,destination,destination,destination,destination,destination,destination,destination,...,start,start,start,start,start,start,start,start,start,start
Unnamed: 0_level_1,1,2,3,4,5,6,7,8,9,10,...,37,38,39,40,41,42,43,44,45,46
CASE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
102,,,,,,,,,,,...,,,,,,,,,,
103,,,,,,,,,,,...,,,,,,,,,,
105,Lissabon,,,,,,,,,,...,,,,,Tel Aviv,,,,,
109,,,,,,,,,,,...,,,,,,,,,,
110,,,,,,,,,,,...,,,,,,,,,,


In [107]:
# reshape to long format: move the innermost column level (the 1..46 counter) into the
# row index, so each row carries one 'destination' and one 'start' value
stacked = data_new.stack(level=-1)

In [111]:
# run the distance lookup once per row and store the result in a new 'distance' column
stacked['distance'] = stacked.apply(func=get_distance, axis='columns')

Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distan

Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.
Getting distance...
Got distance.


In [116]:
# pivot the innermost row-index level back into the columns to restore the wide layout
output = stacked.unstack(level=-1)

In [117]:
# show the unstacked frame as the cell's output
output

Unnamed: 0_level_0,destination,destination,destination,destination,destination,destination,destination,destination,destination,destination,...,distance,distance,distance,distance,distance,distance,distance,distance,distance,distance
Unnamed: 0_level_1,1,2,3,4,5,6,7,8,9,10,...,37,38,39,40,41,42,43,44,45,46
CASE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
102,,,,,,,,,,,...,,,,,,,,,,
105,Lissabon,,,,,,,,,,...,,,,,2852.60,,,,,
110,,,,,,,,,,,...,,,,,,,,,,
112,Baden-Baden,,,,,,,,,,...,,,,,1111.89,,,,,
113,,,,,,,,,,,...,,,,,,,,,,
116,,,,,,,,,,,...,,,,,,,,,,
117,"Guayaquil, ECU",,,,,,,,,,...,,,,,4205.91,,2104.85,,,
118,New York,,,,,,,,,,...,,,,,,,,,,
119,New York,,,,,,,,,,...,,,,,1515.08,,1515.08,,,
122,,,,,,,,,,,...,,,,,,,,,,
