In [1]:
import pickle
from geocoding_getter import *
import numpy as np
import pandas as pd

# WARNING: do not use "Run All" on this notebook. The next two cells reset `geocodings` to an
# empty dict and overwrite the cached ../data/geocodings.pickle; git history is the only backup.

In [None]:
# Start from an empty cache; entries are added by get_geocoding (station -> lookup result).
geocodings = {}

In [None]:
# Persist the in-memory geocoding cache to disk. This overwrites the existing pickle file.
with open("../data/geocodings.pickle", 'wb') as cache_file:
    pickle.dump(geocodings, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Restore the geocoding cache that an earlier run pickled to disk.
# NOTE: pickle.load can execute arbitrary code; only load cache files this project produced.
with open("../data/geocodings.pickle", 'rb') as cache_file:
    geocodings = pickle.load(cache_file)

In [3]:
# When True, get_geocoding prints every station name it is asked to resolve.
VERBOSE = False

# Input workbooks are addressed as DATA_FOLDER + PREFIX + <file name>.
DATA_FOLDER = "../data/raw_mobibikes_data/"
PREFIX = "Mobi_System_Data_"

In [4]:
# Read the 2017 workbook from the raw data folder into a DataFrame.
test_file = "2017.xlsx"
df = pd.read_excel(f"{DATA_FOLDER}{PREFIX}{test_file}")

In [5]:
def get_geocoding(station):
    """Return the geocoding result for ``station``, using the cache when possible.

    The module-level ``geocodings`` dict is consulted first. On a cache miss the
    name is passed to ``geocoder``; if that raises ``NotFound`` a second lookup is
    attempted with the text before the first space (the station's id) removed.
    Whatever the outcome, the result is stored in ``geocodings`` so the same
    station is never looked up twice.

    Parameters
    ----------
    station : str
        Station name as it appears in the data.

    Returns
    -------
    tuple
        The value returned by ``geocoder`` (presumably postal code, latitude,
        longitude -- confirm against ``geocoding_getter``), or
        ``(None, None, None)`` when both lookups fail.
    """
    if VERBOSE:
        print("looking for ... " + station)

    # Cache hit: no external lookup needed.
    if station in geocodings:
        return geocodings[station]

    try:
        lookup_res = geocoder(station)
    except NotFound:
        # Make one more attempt without the station's id. The previous code passed
        # the whole list returned by split() to the geocoder instead of the name.
        _, _, name_without_id = station.partition(' ')
        if name_without_id:
            try:
                lookup_res = geocoder(name_without_id)
            except NotFound:
                lookup_res = (None, None, None)
        else:
            # Nothing follows the first token, so there is no shorter name to retry.
            lookup_res = (None, None, None)

    # Cache failures too, so unresolved stations are not retried for every row.
    geocodings[station] = lookup_res

    return lookup_res

In [6]:
def apply_geocoding(row):
    """Return ``row`` extended with geocoding fields for both of its stations.

    For each of the "Departure" and "Return" stations, three entries are added
    to the end of the row: ``"<type> postal code"``, ``"<type> lat"`` and
    ``"<type> long"``. A station value that is not a string (e.g. NaN for a
    missing value) yields ``None`` for all three entries.

    Parameters
    ----------
    row : pandas.Series
        One record containing "Departure station" and "Return station".

    Returns
    -------
    pandas.Series
        A new Series: the original entries followed by the six added ones.
    """
    _type = ["Departure", "Return"]

    pieces = [row]
    for t in _type:
        station_name = row[f"{t} station"]

        # Equivalent of comparing it with np.nan, check if NaN is passed in
        if not isinstance(station_name, str):
            values = (None, None, None)
        else:
            values = get_geocoding(station_name)

        pieces.append(pd.Series(data=values,
                                index=[f"{t} postal code",
                                       f"{t} lat",
                                       f"{t} long"]))

    # Series.append was removed in pandas 2.0; a single concat gives the same result.
    return pd.concat(pieces)

In [7]:
# Run apply_geocoding once per row; each returned Series becomes a row of the result.
geocoded_df = df.apply(apply_geocoding, axis="columns")

In [8]:
# Sanity check: number of entries currently held in the geocoding cache.
len(geocodings)

133

In [11]:
# Write the geocoded frame to disk.
# NOTE: compression="zip" produces a ZIP archive even though the file name ends in .csv.
geocoded_df.to_csv(path_or_buf="../tmp_2017_geocoded.csv", compression="zip")

In [None]:
# Check script: print every cached entry whose first field is missing/empty
# or is not exactly six characters long.
for station, result in geocodings.items():
    first_field = result[0]
    if first_field and len(first_field) == 6:
        continue
    print(station, result)