In [1]:
import pickle
from geocoding_getter import *
import numpy as np
import pandas as pd

# WARNING: do not "Run All" in this notebook. Some cells overwrite the cached geocodings on disk (git can restore them, but don't rely on that).

In [None]:
# Start from an empty in-memory cache: station name -> geocoding result
geocodings = {}

In [94]:
# Save the in-memory cache to disk. This OVERWRITES the existing pickle, so run
# it only when `geocodings` holds what you want to keep.
with open("../data/tmp/geocodings.pickle", 'wb') as cache_file:
    pickle.dump(geocodings, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Restore the cache from the pickle on disk, replacing any in-memory `geocodings`.
# Unpickling can execute arbitrary code: only load a file you produced yourself.
with open("../data/tmp/geocodings.pickle", 'rb') as cache_file:
    geocodings = pickle.load(cache_file)

In [20]:
# Relative path of the folder holding the yearly trip CSV files
DATA_FOLDER = "../data/"
# File-name prefix of each yearly file; the loader appends the year and ".csv"
PREFIX = "Mobi_System_Data_"
# When True, get_geocoding prints every station name it is asked to look up
VERBOSE = False

In [67]:
years = [2017, 2018, 2019, 2020, 2021]

# Walk every yearly trip file and feed each row's stations through the geocoder
# so that `geocodings` ends up holding every station name seen in the data.
# `df` is rebound on each pass, so after the loop it holds the last year only.
for year in years:
    csv_path = f"{DATA_FOLDER}{PREFIX}{year}.csv"
    df = pd.read_csv(csv_path, compression="zip")

    print(f"Processing year {year} ...")

    # Called purely for its side effect on the cache; the result is discarded
    _ = df.apply(process_geocoding, axis=1)

Processing year 2017 ...
Processing year 2018 ...
***NotFound*** 0994 station for tests
***NotFound*** station for tests
***NotFound*** 0999 Bike Production
***NotFound*** Bike Production
***NotFound*** 0997 HQ Workshop
***NotFound*** HQ Workshop
***NotFound*** 222-0999.4 Smoove_test_4
***NotFound*** Smoove_test_4
***NotFound*** 223-0999.5 Smoove_test_5
***NotFound*** Smoove_test_5
Processing year 2019 ...
***NotFound*** 0991 HQ Workshop
***NotFound*** HQ Workshop
***NotFound*** 1000 Temporary Event: Granville Island Bike Valet
***NotFound*** Temporary Event: Granville Island Bike Valet
***NotFound*** 1000 Fireworks Mobi Bike Valet (brought to you by BEST/Translink)
***NotFound*** Fireworks Mobi Bike Valet (brought to you by BEST/Translink)
***NotFound*** 0980 Workshop - Balancer Bike Check In
***NotFound*** Workshop - Balancer Bike Check In
***NotFound*** 9999 Smoove_atelier
***NotFound*** Smoove_atelier
***NotFound*** 0982 Workshop - Bike Testing
***NotFound*** Workshop - Bike Testin

In [60]:
def process_geocoding(row):
    """Look up the departure and return stations of one trip row.

    Intended for ``DataFrame.apply(..., axis=1)``. Each station name that is a
    string is passed to ``get_geocoding``; the lookup result is discarded here.

    Args:
        row: Mapping-like record with "Departure station" and
            "Return station" entries.

    Returns:
        None.
    """
    for endpoint in ("Departure", "Return"):
        name = row[f"{endpoint} station"]

        # Non-string values (presumably NaN for a missing station) are skipped
        if not isinstance(name, str):
            continue

        get_geocoding(name)

In [83]:
def get_geocoding(station):
    """Return the geocoding for ``station``, looking it up and caching it on a miss.

    Results are stored in the module-level ``geocodings`` dict, so a given
    station string is only passed to ``geocoder`` while it is absent from that
    cache. Failed lookups are cached too, as ``(None, None, None)``.

    If the full string is not found and it contains a space, one more lookup
    is attempted with everything up to the first space removed (station names
    in the data look like "<id> <name>").

    Args:
        station: Station name string.

    Returns:
        The value returned by ``geocoder`` (used elsewhere in this notebook as
        postal code, latitude, longitude), or ``(None, None, None)`` when no
        lookup succeeds.
    """
    if VERBOSE:
        print("looking for ... " + station)

    if station in geocodings:
        return geocodings[station]

    try:
        lookup_res = geocoder(station)
    except NotFound:
        lookup_res = (None, None, None)

        # Make one more attempt without the station's id. A name with no
        # space has nothing to strip, so there is no second attempt (indexing
        # the split result unconditionally used to raise IndexError here).
        parts = station.split(' ', 1)
        if len(parts) == 2:
            try:
                lookup_res = geocoder(parts[1])
            except NotFound:
                lookup_res = (None, None, None)

    geocodings[station] = lookup_res

    return lookup_res

In [5]:
def apply_geocoding(row):
    """Return ``row`` extended with geocoding fields for both of its stations.

    Intended for ``DataFrame.apply(..., axis=1)``. For each of "Departure" and
    "Return", three entries are added after the existing ones:
    "<type> postal code", "<type> lat" and "<type> long".

    Args:
        row: pandas Series with "Departure station" and "Return station"
            entries.

    Returns:
        A new Series: the original entries followed by the six geocoding
        entries. Entries for a non-string station are all None.
    """
    pieces = [row]

    for t in ("Departure", "Return"):
        station_name = row[f"{t} station"]

        # A missing station is not a str (presumably NaN), so there is
        # nothing to look up
        if isinstance(station_name, str):
            values = get_geocoding(station_name)
        else:
            values = (None, None, None)

        pieces.append(pd.Series(data=values,
                                index=[f"{t} postal code",
                                       f"{t} lat",
                                       f"{t} long"]))

    # Series.append was removed in pandas 2.0; a single concat replaces the
    # repeated appends and yields the same ordering
    return pd.concat(pieces)

In [None]:
# Add postal code / lat / long entries for both stations to every trip row.
# NOTE(review): `df` is whichever frame the year-loading loop read last — confirm it is the intended year.
geocoded_df = df.apply(apply_geocoding, axis=1)

In [93]:
# Number of station names currently held in the cache
len(geocodings)

254

In [None]:
# Write the geocoded trips as a zip-compressed CSV.
# NOTE(review): the file name is hard-coded to 2017 whatever year `geocoded_df` was built from — confirm.
geocoded_df.to_csv("../tmp_2017_geocoded.csv", compression="zip")

In [87]:
# Station names to remove from the cache; filled by the check cell
to_pop = []

In [88]:
# Check script: queue every cached station whose last tuple element is falsy
# (presumably the longitude, i.e. the lookup produced no coordinates).
# An alternative check on the first element (missing, or not exactly
# 6 characters long) was tried and is currently disabled.
to_pop.extend(name for name, geo in geocodings.items() if not geo[-1])

In [92]:
# Remove the queued stations from the cache.
# pop(..., None) keeps this cell safe to re-run and tolerant of duplicate names
# in to_pop (the check cell appends again every time it is executed); a plain
# pop(s) raises KeyError in both situations.
for s in to_pop:
    geocodings.pop(s, None)

In [91]:
# Record the stations queued in to_pop, one name per line
with open("../data/not_found_stations.txt", 'w') as out_file:
    out_file.writelines("%s\n" % name for name in to_pop)

In [102]:
# Flatten the cache into one record per station: name plus the first three
# elements of its lookup tuple.
items = [[address, geo[0], geo[1], geo[2]] for address, geo in geocodings.items()]

# Naming the columns in the constructor avoids an in-place rename and keeps
# the four-column schema even when the cache is empty.
geocodings_df = pd.DataFrame(items,
                             columns=["address", "postal_code", "lat", "long"])

In [105]:
# Export the cache table as CSV; no `index` argument is passed, so pandas' default (index=True) also writes the row index as the first column
geocodings_df.to_csv("../data/geocodings.csv")