In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import pandas as pd
import numpy as np
import glob
import os
import time

In [None]:
# Nominatim geocoding client used by getLatLon below; requests are sent with the user agent 'getGeoInfo'.
geolocator = Nominatim(user_agent='getGeoInfo')
def getLatLon(address, max_retries=3, retry_delay=1.0):
    """Geocode ``address`` and return its latitude and longitude.

    Note: a later cell of this notebook defines another ``getLatLon`` that
    additionally returns the matched address; whichever cell ran last is the
    definition in effect.

    Parameters
    ----------
    address : str
        Free-form address handed to ``geolocator.geocode``.
    max_retries : int, default 3
        Number of additional attempts made after a ``GeocoderTimedOut``.
    retry_delay : float, default 1.0
        Base pause in seconds before retrying; the pause grows linearly with
        the attempt number.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]``, or ``[None, None]`` when the geocoder
        returns no match.

    Raises
    ------
    GeocoderTimedOut
        When the first attempt and every retry time out.
    """
    attempts = max(int(max_retries), 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            location = geolocator.geocode(address, timeout=10)
        except GeocoderTimedOut:
            # Bounded retry with a pause: retrying through unbounded recursion
            # would hammer the service and end in RecursionError during an outage.
            if attempt == attempts:
                raise
            time.sleep(retry_delay * attempt)
            continue
        if location:
            return pd.Series([location.latitude, location.longitude])
        return pd.Series([None, None])

# Default folders: un-geocoded address CSVs are read from input_dir, geocoded CSVs are written to output_dir.
input_dir, output_dir = './data/ungeocoded', './data/geocoded'

In [None]:
# Nominatim client used by getLatLon (defined next) to get the latitude and longitude of an address
geolocator = Nominatim(user_agent='getGeoInfo')
def getLatLon(address, max_retries=3, retry_delay=1.0):
    """Geocode ``address`` and return its coordinates and matched address.

    Parameters
    ----------
    address : str
        Free-form address handed to ``geolocator.geocode``.
    max_retries : int, default 3
        Number of additional attempts made after a ``GeocoderTimedOut``.
    retry_delay : float, default 1.0
        Base pause in seconds before retrying; the pause grows linearly with
        the attempt number.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude, address]`` as reported by the geocoder, or
        ``[None, None, None]`` when the geocoder returns no match.

    Raises
    ------
    GeocoderTimedOut
        When the first attempt and every retry time out.
    """
    # NOTE(review): no pause is inserted between successful requests; if the
    # public Nominatim service starts rejecting calls, consider wrapping the
    # geocoder in geopy's RateLimiter.
    attempts = max(int(max_retries), 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            location = geolocator.geocode(address, timeout=10)
        except GeocoderTimedOut:
            # Bounded retry with a pause: retrying through unbounded recursion
            # would hammer the service and end in RecursionError during an outage.
            if attempt == attempts:
                raise
            time.sleep(retry_delay * attempt)
            continue
        if location:
            return pd.Series([location.latitude, location.longitude, location.address])
        return pd.Series([None, None, None])

## Geocoding Addresses of Sold Homes

In [None]:
# Geocoding takes a long time, so choose which folders to process here.
# Default: the sold-homes data.
output_dir = './data/geocoded'
input_dir = './data/ungeocoded'

# Alternative: uncomment both lines to geocode the './data/predict' folders instead.
# input_dir = './data/predict/ungeocoded'
# output_dir = './data/predict/geocoded'

In [None]:
# Geocode every '<city>-ungeocoded.csv' found in input_dir and save the result
# as '<city>-geocoded.csv' in output_dir.
geo_columns = ['lat', 'lon', 'addressDetails']

# Create the destination up front so a missing folder cannot throw away a long
# geocoding run at the moment the result is written.
os.makedirs(output_dir, exist_ok=True)

# Sorted so cities are processed in a predictable order.
csv_files = sorted(glob.glob(os.path.join(input_dir, '*ungeocoded.csv')))
for file in csv_files:
    city = os.path.basename(file).replace('-ungeocoded.csv', '')
    output_file = os.path.join(output_dir, f'{city}-geocoded.csv')

    # Existing output is never overwritten: delete the old file (or rename it)
    # if you want to geocode this city again.
    if os.path.exists(output_file):
        print(f'{city} already exists! skipping...')
        continue
    print(f'processing: {city}...')

    df = pd.read_csv(file)
    # One getLatLon call per address. Building the frame explicitly keeps the
    # three-column assignment valid even when the CSV has a header but no rows.
    rows = [getLatLon(address).tolist() for address in df['address']]
    df[geo_columns] = pd.DataFrame(rows, columns=geo_columns, index=df.index)
    df.to_csv(output_file, index=False)

## Geocoding City Centers (Feature)

In [None]:
# City-centre addresses were found manually, since this information is not
# available from web scraping or from the other data.
# The trailing comment on each entry names the place at that address; these
# comments can be removed at the end of the project.
city_center_mapping = dict([
    ('Vancouver', '701 W Georgia St, Vancouver, BC'),  # CF Pacific Centre
    ('Surrey', '10153 King George Blvd, Surrey, BC'),  # Central City mall
    ('Langley', '20300 Fraser Hwy, Langley, BC'),  # City Centre Square
    ('Richmond', '6551 No. 3 Rd, Richmond, BC'),  # CF Richmond Centre
    ('Burnaby', '4700 Kingsway, Burnaby, BC'),  # Metrotown
    ('Delta', '1299A 56 St, Delta, BC'),  # Tsawwassen Town Centre Mall
    ('Pitt Meadows', '12007 Harris Rd, Pitt Meadows, BC'),  # Pitt Meadows City Hall (schools, park, shopping mall, transit nearby)
    ('New Westminster', '777 Columbia St, New Westminster, BC'),  # Anvil Centre (college, transit, restaurants nearby)
    ('White Rock', '15400 Marine Dr, White Rock, BC'),  # Totem Park (near the beach)
    ('Coquitlam', '2929 Barnet Hwy #2201, Coquitlam, BC'),  # Coquitlam Centre
    ('Abbotsford', '33498 Bevan Ave, AbbotsFord, BC'),  # McCallum Centre (close to restaurants, transit, school)
    ('Maple Ridge', '11900 Haney Pl, Maple Ridge, BC'),  # Haney Place Mall, apparently the centre of the downtown area
])

# Geocode each city-centre address and save one row of coordinates per city.
centers_file = './data/features/cityCenters/centers.csv'

data = []
for city, city_center_address in city_center_mapping.items():
    # Keep only the first two values (latitude, longitude): the three-value
    # getLatLon defined earlier in this notebook also returns the matched
    # address, which would break a plain two-name unpacking.
    cityLat, cityLon = getLatLon(city_center_address).iloc[:2]
    data.append({'city': city, 'cityCenterLat': cityLat, 'cityCenterLon': cityLon})

df_cc = pd.DataFrame(data)
# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(centers_file), exist_ok=True)
df_cc.to_csv(centers_file, index=False)

## Geocode Public and Independent Schools

In [None]:
# data source: https://bcschoolcontacts.gov.bc.ca/
schools_dir = './data/features/schools/'
# schools_total.csv is written into this same folder by the last cell of this
# section; skip it so that re-running the notebook does not read its own output
# back in as if it were a district file. Sorted for a predictable row order.
school_csv_files = [
    path for path in sorted(glob.glob(os.path.join(schools_dir, '*.csv')))
    if os.path.basename(path) != 'schools_total.csv'
]
district_dfs = [pd.read_csv(path) for path in school_csv_files]

# Only schools located in one of the cities that has a city-centre entry are kept.
valid_cities = set(city_center_mapping.keys())

schools_df = pd.concat(district_dfs, ignore_index=True)
# Keep public and independent schools whose facility type is 'Standard'.
schools_df = schools_df[schools_df['School Category Code'].isin(['Public', 'Independent'])]
# drop=True: the old row labels are not needed as an extra 'index' column.
schools_df = schools_df[schools_df['Facility Type Code'] == 'Standard'].reset_index(drop=True)
schools_df = schools_df.rename(columns={'District Number': 'district',
                                        'Display Name': 'name',
                                        'Mailing Address': 'streetAddress',
                                        'Mailing Address City': 'city',
                                        'Mailing Address PostalCode': 'postalCode'})

# Clean addresses
schools_df['streetAddress'] = schools_df['streetAddress'].str.title()
schools_df['city'] = schools_df['city'].str.title()
schools_df = schools_df[schools_df['city'].isin(valid_cities)]
# .copy() gives an independent frame, so adding the 'address' column below does
# not write into a slice of the filtered frame.
schools_df = schools_df[['district', 'name', 'streetAddress', 'city', 'postalCode']].copy()

# Full query string passed to the geocoder.
schools_df['address'] = schools_df['streetAddress'] + ', ' + schools_df['city'] + ', BC, Canada'

In [None]:
# Geocode every school address. Only the first two values (latitude, longitude)
# of each result are kept, so this works whether getLatLon returns two values or
# also the matched address, and also when schools_df has no rows.
coords = [getLatLon(address).iloc[:2].tolist() for address in schools_df['address']]
schools_df[['lat', 'lon']] = pd.DataFrame(coords, columns=['lat', 'lon'], index=schools_df.index)

In [None]:
# Some schools use PO Box mailing addresses; exclude the schools that ended up
# without coordinates. Only 'lat'/'lon' are checked, so a school is not dropped
# merely because another field (e.g. postalCode) is missing.
schools_df = schools_df.dropna(subset=['lat', 'lon'])
# NOTE(review): the row index is written as the first CSV column here (no
# index=False), unlike the other exports in this notebook - confirm readers expect it.
schools_df.to_csv('./data/features/schools/schools_total.csv')