In [1]:
import googlemaps
from src.utils import get_api_key, haversine_km, now
import pandas as pd
import json
import numpy as np
from datetime import datetime

def now() -> str:
    t = datetime.now()
    return t.strftime("%Y%m%d-%H%M%S")

# Check Pos Malaysia coordinates

Let's check that the coordinates from Pos Malaysia are close to their location in Google Maps.

We'll check only master postcodes to keep costs down.

In [2]:
df_import = pd.read_excel(
    "./output/postcode_output.xlsx", 
    sheet_name="Master Postcodes",
    converters={"master_postcode": str}
)

In [3]:
df = df_import[["master_postcode", "state", "lat_lon"]].set_index("master_postcode")
df.insert(loc=2, column="gmaps_lat_lon", value=pd.NA)
df["gmaps_lat_lon"] = df["gmaps_lat_lon"].astype("object")
display(df)
display(df.info())

Unnamed: 0_level_0,state,lat_lon,gmaps_lat_lon
master_postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79000,Johor,"(1.42513621, 103.61443042)",
80000,Johor,"(1.456123, 103.761701)",
81000,Johor,"(1.662964, 103.600178)",
81400,Johor,"(1.606506, 103.647617)",
81440,Johor,"(1.876001, 103.614046)",
...,...,...,...
24050,Terengganu,"(4.26868955, 103.2119044)",
24100,Terengganu,"(4.335356, 103.479837)",
24200,Terengganu,"(4.426911, 103.452517)",
24300,Terengganu,"(4.50032, 103.440871)",


<class 'pandas.core.frame.DataFrame'>
Index: 424 entries, 79000 to 87000
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   state          424 non-null    object
 1   lat_lon        424 non-null    object
 2   gmaps_lat_lon  0 non-null      object
dtypes: object(3)
memory usage: 13.2+ KB


None

In [4]:
num = len(df)
print(f"There are {num} master postcodes.")
print(f"This will cost USD{num * 0.05:.2f} to find coordinates from Google Maps API")

There are 424 master postcodes.
This will cost USD21.20 to find coordinates from Google Maps API


In [5]:
# Retrieve a small sample of postcodes
SAMPLE_SIZE = 10
df_sample = df.iloc[::20][:SAMPLE_SIZE]
display(df_sample)
print(f"Sample has {len(df_sample)} elements.")

Unnamed: 0_level_0,state,lat_lon,gmaps_lat_lon
master_postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79000,Johor,"(1.42513621, 103.61443042)",
83000,Johor,"(1.86841, 102.943824)",
85300,Johor,"(2.381831, 103.020378)",
6500,Kedah,"(6.145992, 100.432028)",
9500,Kedah,"(5.42056, 100.666936)",
17700,Kelantan,"(5.778147, 101.887721)",
78300,Melaka,"(2.350005, 102.108881)",
73000,Negeri Sembilan,"(2.468613, 102.231114)",
27100,Pahang,"(4.232898, 101.991298)",
39200,Pahang,"(4.413895, 101.381651)",


Sample has 10 elements.


In [6]:
SKIP = True
if not SKIP:
    gmaps = googlemaps.Client(key=get_api_key())
    responses = dict()
    ts = now()
    with open(f"./responses/geocode-{ts}.json", mode="a") as fp:
        for mpc, data in df_sample.iterrows():
            state = data["state"]
            try:
                response = gmaps.geocode(f"{mpc},{state},Malaysia")
                print(response)
            except Exception as exc:
                print(exc)
            responses[mpc] = response
        json.dump(responses, fp)

In [7]:
with open("./responses/geocode-20210604-220850.json") as fp:
    responses = json.load(fp)
for mpc, r in responses.items():
    mpc = str(mpc)
    location = r[0]["geometry"]["location"]
    gmaps_lat_lon = (location["lat"], location["lng"])
    df_sample.at[mpc, "gmaps_lat_lon"] = gmaps_lat_lon
display(df_sample)

Unnamed: 0_level_0,state,lat_lon,gmaps_lat_lon
master_postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79000,Johor,"(1.42513621, 103.61443042)","(1.4171501, 103.6566612)"
83000,Johor,"(1.86841, 102.943824)","(1.8539494, 102.9357477)"
85300,Johor,"(2.381831, 103.020378)","(2.4165848, 103.0126523)"
6500,Kedah,"(6.145992, 100.432028)","(6.140907100000001, 100.4115631)"
9500,Kedah,"(5.42056, 100.666936)","(6.1183964, 100.3684595)"
17700,Kelantan,"(5.778147, 101.887721)","(5.7932364, 101.9119871)"
78300,Melaka,"(2.350005, 102.108881)","(2.3478745, 102.0946818)"
73000,Negeri Sembilan,"(2.468613, 102.231114)","(2.491166, 102.2891314)"
27100,Pahang,"(4.232898, 101.991298)","(4.4429123, 102.0032957)"
39200,Pahang,"(4.413895, 101.381651)","(3.972832999999999, 102.433126)"


In [8]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 79000 to 39200
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   state          10 non-null     object
 1   lat_lon        10 non-null     object
 2   gmaps_lat_lon  10 non-null     object
dtypes: object(3)
memory usage: 620.0+ bytes


In [9]:
df_sample

Unnamed: 0_level_0,state,lat_lon,gmaps_lat_lon
master_postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
79000,Johor,"(1.42513621, 103.61443042)","(1.4171501, 103.6566612)"
83000,Johor,"(1.86841, 102.943824)","(1.8539494, 102.9357477)"
85300,Johor,"(2.381831, 103.020378)","(2.4165848, 103.0126523)"
6500,Kedah,"(6.145992, 100.432028)","(6.140907100000001, 100.4115631)"
9500,Kedah,"(5.42056, 100.666936)","(6.1183964, 100.3684595)"
17700,Kelantan,"(5.778147, 101.887721)","(5.7932364, 101.9119871)"
78300,Melaka,"(2.350005, 102.108881)","(2.3478745, 102.0946818)"
73000,Negeri Sembilan,"(2.468613, 102.231114)","(2.491166, 102.2891314)"
27100,Pahang,"(4.232898, 101.991298)","(4.4429123, 102.0032957)"
39200,Pahang,"(4.413895, 101.381651)","(3.972832999999999, 102.433126)"


In [10]:
def haversine_columns(df):
    print(df)
    lat1, lon1 = df["lat_lon"]
    lat2, lon2 = df["gmaps_lat_lon"]
    print(lat1, lon1, lat2, lon2)
    return haversine_km(lat1, lon1, lat2, lon2)
df_sample["haversine_km"] = df_sample[["lat_lon", "gmaps_lat_lon"]].apply(haversine_columns, axis=1)

# TODO: fix this apply function

lat_lon          (1.42513621, 103.61443042)
gmaps_lat_lon      (1.4171501, 103.6566612)
Name: 79000, dtype: object


ValueError: too many values to unpack (expected 2)