# Simple script to calculate the distance between the streets in Waterloo and the location of the University of Waterloo

In [1]:
from geopy.geocoders import GoogleV3
from geopy.distance import vincenty
from geopy.exc import GeocoderTimedOut, GeocoderQuotaExceeded
from multiprocessing import Pool

import pandas as pd
import time
import sys

#### A list of street names in Waterloo, Ontario can be found here: https://geographic.org/streetview/canada/on/city_of_waterloo.html
The list of street names is already saved in a file named **Waterloo_Street.txt** in the **data** directory.

In [2]:
# Load the street list, then derive a full, geocodable address for every street
# by appending the city, province and country to its name.
waterloo_street = pd.read_csv("./data/Waterloo_Street.txt")
waterloo_street["address"] = [
    street_name + ", Waterloo, Ontario, Canada"
    for street_name in waterloo_street["street name"]
]

waterloo_street

Unnamed: 0,street name,address
0,Abbotswood Court,"Abbotswood Court, Waterloo, Ontario, Canada"
1,Aberdeen Road,"Aberdeen Road, Waterloo, Ontario, Canada"
2,Academy Crescent,"Academy Crescent, Waterloo, Ontario, Canada"
3,Acadia Court,"Acadia Court, Waterloo, Ontario, Canada"
4,Ainsworth Court,"Ainsworth Court, Waterloo, Ontario, Canada"
5,Albert Street,"Albert Street, Waterloo, Ontario, Canada"
6,Alexandra Avenue,"Alexandra Avenue, Waterloo, Ontario, Canada"
7,Alexmuir Place,"Alexmuir Place, Waterloo, Ontario, Canada"
8,Algonquin Drive,"Algonquin Drive, Waterloo, Ontario, Canada"
9,Allen Street East,"Allen Street East, Waterloo, Ontario, Canada"


## Calculate the distance for each street
Vincenty's formulae are used to calculate the distance (km) between each street in Waterloo and the location of the University of Waterloo.

In [3]:
def calculate_distance(address, max_retries=5, backoff=1.0):
    """Return the distance in km between ``address`` and the University of Waterloo.

    The address is geocoded with the Google Maps geocoder and the distance to
    the hard-coded campus coordinates is computed with Vincenty's formulae,
    rounded to 3 decimal places.

    Timeouts and quota errors are retried a bounded number of times with an
    exponentially growing pause, rather than recursing immediately and without
    limit (which keeps hitting an API that has just reported it is overloaded).

    Parameters
    ----------
    address : str
        Free-form address to geocode.
    max_retries : int, optional
        How many times a timed-out / quota-limited request is retried.
    backoff : float, optional
        Pause in seconds before the first retry; doubled for each later retry.

    Returns
    -------
    float or str
        The rounded distance in kilometres, or the string
        ``"Error: Could not find the coordinates for address: <address>"`` when
        the address could not be geocoded.
    """
    error_message = "Error: Could not find the coordinates for address: %s" % (address,)
    geolocator = GoogleV3()  # Google Maps API

    housing = None
    for attempt in range(max_retries + 1):
        try:
            housing = geolocator.geocode(address, timeout=10)
            break

        except (GeocoderTimedOut, GeocoderQuotaExceeded):
            # Google Maps (standard API) only allows:
            #     1.) 2,500 free requests per day, calculated as the sum of client-side and server-side queries.
            #     2.) 50 requests per second, calculated as the sum of client-side and server-side queries.
            # Retrying instantly only makes both limits worse, so wait first and give up eventually.
            if attempt >= max_retries:
                return error_message
            time.sleep(backoff * (2 ** attempt))

        except Exception:
            # Report the exception type so unknown geocoder failures can be diagnosed,
            # but let KeyboardInterrupt / SystemExit propagate.
            print("Unexpected error:", sys.exc_info()[0])
            return error_message

    if housing is None:
        # The geocoder can come back with no match at all for an address.
        return error_message

    housing_coordinates = (housing.latitude, housing.longitude)
    UW_coordinates = (43.469757, -80.5409518)

    distance = vincenty(housing_coordinates, UW_coordinates).kilometers
    return round(distance, 3)

Here, we take advantage of multiprocessing. First, we slice the DataFrame into chunks (i.e. each chunk has size 10). Then, we apply the *calculate_distance* function to these chunks concurrently using multiple processes. Using multiprocessing is about **10x** faster.

In [4]:
###### Apply a function to a DataFrame in parallel using multiprocessing ######
# Pattern taken from:
# https://stackoverflow.com/questions/26187759/parallelize-apply-after-pandas-groupby/29281494#29281494

# Number of worker processes requested from multiprocessing.Pool in applyParallel.
process_num = 20

def applyParallel(dfGrouped, func):
    """Apply ``func`` to every group in a process pool and concatenate the results.

    Parameters
    ----------
    dfGrouped : iterable of (key, group) pairs
        Typically a pandas GroupBy object; only the group part is used.
    func : callable
        Called once per group in a worker process; its return values are
        combined with ``pd.concat``.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-group results, in group order. An empty
        DataFrame when there are no groups.
    """
    grouped_list = [group for _, group in dfGrouped]
    if not grouped_list:
        # pd.concat raises ValueError on an empty list, and there is nothing
        # to hand to a pool anyway.
        return pd.DataFrame()

    # Never start more workers than there are groups to process.
    workers = min(process_num, len(grouped_list))
    with Pool(processes=workers) as p:
        # map() blocks until every result is back, so letting the with-block
        # shut the pool down is enough; no explicit close()/join() is needed.
        ret_list = p.map(func, grouped_list)
    return pd.concat(ret_list)

def apply_row_df(df):
    """Return a copy of ``df`` with a "distance (km)" column added.

    The new column holds the result of ``calculate_distance`` for each value
    of the ``address`` column. The input frame is left untouched, so calling
    this on a slice of a larger frame neither modifies that frame nor triggers
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``address`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra "distance (km)" column.
    """
    distances = df["address"].apply(calculate_distance)
    # The column name contains spaces, so it has to be passed via a dict.
    return df.assign(**{"distance (km)": distances})

# Integer-dividing the row index by 10 gives every run of 10 consecutive index
# values the same group key, so each group is one chunk of work for the pool
# (chunks of 10 rows, assuming the default 0..n-1 integer index).
grouped = waterloo_street.groupby(waterloo_street.index // 10) # Partition into chunks of size 10

# Time the whole parallel run (pool start-up, geocoding, concatenation).
tic = time.time()
# waterloo_street is rebound to the concatenated result, which carries the
# extra "distance (km)" column added by apply_row_df.
waterloo_street = applyParallel(grouped, apply_row_df)
toc = time.time()

print("Total execution time using multiprocessing:  {}s".format(toc - tic))

Total execution time using multiprocessing:  95.18907070159912s


#### Ignore Waterloo streets that are too far from the University of Waterloo (i.e. a distance > 2.5 km). Sort the data according to the distance.

In [5]:
# Keep only the streets within 2.5 km of the university, nearest first.
# calculate_distance returns an error string when an address cannot be
# geocoded; comparing such a mixed str/float column with 2.5 raises TypeError.
# Coercing to numbers turns the error strings into NaN, and NaN <= 2.5 is
# False, so those rows are simply dropped by the filter.
distance_km = pd.to_numeric(waterloo_street["distance (km)"], errors="coerce")
waterloo_street = waterloo_street[distance_km <= 2.5]
nearby_street = waterloo_street.sort_values("distance (km)")

nearby_street

Unnamed: 0,street name,address,distance (km)
645,Ring Road,"Ring Road, Waterloo, Ontario, Canada",0.283
688,Seagram Drive,"Seagram Drive, Waterloo, Ontario, Canada",0.534
406,Iroquois Place,"Iroquois Place, Waterloo, Ontario, Canada",0.638
762,Tennyson Place,"Tennyson Place, Waterloo, Ontario, Canada",0.660
453,Lester Street,"Lester Street, Waterloo, Ontario, Canada",0.696
548,North Campus Road,"North Campus Road, Waterloo, Ontario, Canada",0.741
8,Algonquin Drive,"Algonquin Drive, Waterloo, Ontario, Canada",0.749
696,Shakespeare Place,"Shakespeare Place, Waterloo, Ontario, Canada",0.757
752,Sunview Street,"Sunview Street, Waterloo, Ontario, Canada",0.764
466,Longfellow Drive,"Longfellow Drive, Waterloo, Ontario, Canada",0.766


#### Write the data to a csv file. 
Only write the street names into the file.

In [6]:
# Reduce the frame to the street-name column and write it out as plain text:
# one street per line, with neither a header row nor the index.
nearby_street = nearby_street.loc[:, ["street name"]]
nearby_street.to_csv("./data/nearby_street.txt", header=False, index=False)