In [1]:
import csv
from collections import namedtuple
from itertools import combinations

import numpy as np
import pandas as pd
import pickle
from src.utils import check_hash, haversine_km, postcode_str, parse_coord_tuple

In [2]:
# Check Excel file hash (in case we accidentally changed some data in the file).
POSM_FILE = "./data/Poscode - Lat Long.xlsx"
assert check_hash(POSM_FILE)

# Import the Excel file. Keep the Postcode column as a string.
df_import = pd.read_excel(POSM_FILE, sheet_name="Sheet2", dtype={"POSTCODE": str})

In [3]:
df = df_import.rename(
    columns={
        "LOCATION": "location",
        "POSTCODE": "postcode",
        "POST_OFFICE": "district_0",
        "POST_OFFICE_1": "district_1",
        "STATE": "state",
        "DATEUPDATE": "last_updated",
        "POINT_X": "longitude",
        "POINT_Y": "latitude",
    }
)
display(df.head())
display(df.info())

Unnamed: 0,location,postcode,district_0,district_1,state,last_updated,longitude,latitude
0,Pusat Komersial & Perindustrian Gangsa Jaya,76100,Melaka,Melaka,Melaka,2020-10-02 00:00:00.0000000,102.264939,2.214094
1,Rimbun Kiara,70200,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
2,Rimbun Impian,70300,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
3,Rimbun Jasmin,70300,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
4,Nusari Bayu 2,71950,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71031 entries, 0 to 71030
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   location      71031 non-null  object 
 1   postcode      71031 non-null  object 
 2   district_0    71031 non-null  object 
 3   district_1    71031 non-null  object 
 4   state         71031 non-null  object 
 5   last_updated  71031 non-null  object 
 6   longitude     71031 non-null  float64
 7   latitude      71031 non-null  float64
dtypes: float64(2), object(6)
memory usage: 4.3+ MB


None

In [4]:
# White space and capitalisation.
for col in ["location", "postcode", "district_0", "district_1", "state"]:
    df[col] = df[col].apply(lambda x: x.strip().removesuffix("_x000D_"))
    df[col] = df[col].apply(lambda x: x[0].upper() + x[1:])

# We can drop location and date_updated, as it's not very useful.
df = df.drop(columns=["location", "last_updated"]).drop_duplicates()

# Combine longitude and latitude into 1 column.
# It makes for easier aggregate operations later.
df["lat_lon"] = list(zip(df["latitude"], df["longitude"]))
df = df.drop(columns=["latitude", "longitude"])

# Clean state column.
CLEANED_STATES = [
    "Johor",
    "Kedah",
    "Kelantan",
    "Melaka",
    "Negeri Sembilan",
    "Pahang",
    "Perak",
    "Perlis",
    "Pulau Pinang",
    "Sabah",
    "Sarawak",
    "Selangor",
    "Terengganu",
    "WP Kuala Lumpur",
    "WP Labuan",
    "WP Putrajaya",
]


def unclean_states():
    return sorted([s for s in df.state.unique() if s not in CLEANED_STATES])

print("Unclean states:", unclean_states())

clean_state_dict = {
    "WP kuala Lumpur": "WP Kuala Lumpur",
}

df.state = df.state.apply(
    lambda x: clean_state_dict[x] if x in clean_state_dict.keys() else x
)
print("Unclean states:", unclean_states())
assert unclean_states() == []

# Why are there two district fields? After cleaning, they are the same.
df_districts = df.loc[:, ["district_0", "district_1"]].drop_duplicates()
for col in df_districts:
    print(f"{col} has {len(df_districts[col].unique())} unique values.")
assert len(df_districts.district_0.unique()) == len(df_districts.district_0.unique())

# We can drop district_0 and rename district_1 to district.
df = df.drop(columns="district_0").rename(columns={"district_1": "district"})
df.head()

# Add PPV regions: KL, Selangor & Putrajaya are the same region
df["ppv_region"] = df["state"].apply(
    lambda x: "Selangor/KL/Putrajaya"
    if x in ["Selangor", "WP Kuala Lumpur", "WP Putrajaya"]
    else x
)

Unclean states: ['WP kuala Lumpur']
Unclean states: []
district_0 has 437 unique values.
district_1 has 437 unique values.


In [5]:
FIXED_DATA_FILE = "./data/bad_postcode_data_Thev.xlsx"
assert check_hash(FIXED_DATA_FILE)
df_import2 = pd.read_excel(FIXED_DATA_FILE, sheet_name="Sheet1", converters={"postcode": postcode_str, "lat_lon": parse_coord_tuple})
df_fix = df_import2[["postcode", "state", "lat_lon"]].set_index("postcode")
df_fix

Unnamed: 0_level_0,state,lat_lon
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
2700,Perlis,"(6.342472437999014, 100.18982195228926)"
2707,Perlis,"(6.342472437999014, 100.18982195228926)"
2709,Perlis,"(6.342472437999014, 100.18982195228926)"
2800,Perlis,"(6.342472437999014, 100.18982195228926)"
14100,Pulau Pinang,"(5.283955, 100.476943)"
14110,Pulau Pinang,"(5.283955, 100.476943)"
14120,Pulau Pinang,"(5.283955, 100.476943)"
6200,Kedah,"(6.216754689734435, 100.39997806495073)"
6207,Kedah,"(6.216754689734435, 100.39997806495073)"
6209,Kedah,"(6.216754689734435, 100.39997806495073)"


In [6]:
pc_fixed = df_fix.index.to_list()

def fix_coords(series) -> tuple:
    pc = series["postcode"]
    if pc in pc_fixed:
        return df_fix.loc[pc]["lat_lon"]
    else:
        return series["lat_lon"]

df["fixed_lat_lon"] = df.apply(fix_coords, axis="columns")
df[df["lat_lon"] != df["fixed_lat_lon"]].sort_values("postcode")

Unnamed: 0,postcode,district,state,lat_lon,ppv_region,fixed_lat_lon
2799,2700,Simpang Ampat,Perlis,"(5.283955, 100.476943)",Perlis,"(6.342472437999014, 100.18982195228926)"
3624,2707,Simpang Ampat,Perlis,"(5.283955, 100.476943)",Perlis,"(6.342472437999014, 100.18982195228926)"
3625,2709,Simpang Ampat,Perlis,"(5.283955, 100.476943)",Perlis,"(6.342472437999014, 100.18982195228926)"
3626,2800,Simpang Ampat,Perlis,"(5.283955, 100.476943)",Perlis,"(6.342472437999014, 100.18982195228926)"
6153,6200,Kepala Batas,Kedah,"(5.610519, 100.444822)",Kedah,"(6.216754689734435, 100.39997806495073)"
7573,6207,Kepala Batas,Kedah,"(5.610519, 100.444822)",Kedah,"(6.216754689734435, 100.39997806495073)"
7574,6209,Kepala Batas,Kedah,"(5.610519, 100.444822)",Kedah,"(6.216754689734435, 100.39997806495073)"
61055,9130,Baling,Kedah,"(2.930566, 112.542298)",Kedah,"(5.604487548645426, 100.77210312560088)"
9333,9130,Baling,Kedah,"(5.67511, 100.918537)",Kedah,"(5.604487548645426, 100.77210312560088)"
641,13200,Kepala Batas,Pulau Pinang,"(5.610519, 100.444822)",Pulau Pinang,"(5.218040857077913, 100.49509023385203)"


In [7]:
# Replace lat_lon with the fixed values
df["lat_lon"] = df["fixed_lat_lon"]
df = df.drop("fixed_lat_lon", axis="columns")


In [8]:
df

Unnamed: 0,postcode,district,state,lat_lon,ppv_region
0,76100,Melaka,Melaka,"(2.214094, 102.264939)",Melaka
1,70200,Seremban,Negeri Sembilan,"(2.71934, 101.941512)",Negeri Sembilan
2,70300,Seremban,Negeri Sembilan,"(2.71934, 101.941512)",Negeri Sembilan
4,71950,Seremban,Negeri Sembilan,"(2.71934, 101.941512)",Negeri Sembilan
5,43000,Kajang,Selangor,"(2.99319, 101.787)",Selangor/KL/Putrajaya
...,...,...,...,...,...
70583,88875,Kota Kinabalu,Sabah,"(5.982556, 116.074506)",Sabah
70584,89707,Bongawan,Sabah,"(5.52718611, 115.857611)",Sabah
70585,91128,Lahad Datu,Sabah,"(5.024247, 118.330774)",Sabah
70588,32100,TLDM Lumut,Perak,"(4.21121389, 100.641323)",Perak


In [9]:
# Helper function t
def postcode_level(postcode: str) -> int:
    """Returns the postcode level: number of digits before the ending string of zeroes.

    e.g. 54000 -> Level 2 postcode
         54680 -> Level 4 postcode
    """
    return len(postcode.rstrip("0"))

In [23]:
# Find all postcodes which have multiple lat_lon coordinates
temp = (
    df[["postcode", "lat_lon"]]
    .groupby("postcode")
    .count()
    .rename(columns={"lat_lon": "num_lat_lon"})
)
temp = temp[temp["num_lat_lon"] > 1].sort_values(by="num_lat_lon", ascending=False)
display(temp)
print("-----")
print("Example: postcode 21400:")
display(df[df["postcode"] == "21400"])

Unnamed: 0_level_0,num_lat_lon
postcode,Unnamed: 1_level_1
21400,3
6010,2
57000,2
81300,2
76100,2
75460,2
72120,2
71800,2
59200,2
53100,2


-----
Example: postcode 21400:


Unnamed: 0,postcode,district,state,lat_lon,ppv_region
4908,21400,Bukit Payong,Terengganu,"(5.232199, 103.101954)",Terengganu
14255,21400,Kuala Terengganu,Terengganu,"(5.337306, 103.13771)",Terengganu
61939,21400,Chalok,Terengganu,"(5.424328, 102.837014)",Terengganu


In [22]:
# Pick the smallest coordinates (tuple comparison always compares latitude first)
def mean_coord(data, **kwargs):
    lat_lon = data["lat_lon"]
    if len(lat_lon) > 1:
        print(lat_lon)
        print(mean(lat_lon))
gb = df[["postcode", "lat_lon"]].groupby("postcode").apply(mean_coord, axis=1)
display(gb)
display(df[df["postcode"] == "21400"])
# Change above to np.mean for the average position instead

TypeError: mean_coord() got an unexpected keyword argument 'axis'

In [None]:
# Create mapping of all postcodes to their master postcode

# Master postcode is the postcode with the most zeroes at the end. Break ties (e.g. 50000, 60000) by picking the smaller number.
def master_postcode_sort(postcode: str):
    return (postcode_level(postcode), int(postcode))

def master_postcode_agg(series):
    postcodes = series.to_list()
    return min(postcodes, key=master_postcode_sort)

df["master_postcode"] = (
    df[["postcode", "lat_lon"]].
    groupby(["lat_lon"]).
    transform(master_postcode_agg)
)
display(df.head())


In [None]:
# dataframe_postcodes
df_pc = df[
    ["postcode", "master_postcode", "state", "ppv_region", "lat_lon"]
].drop_duplicates()

display(df_pc)

In [None]:
# dataframe_masterpostcodes
# Sort by region for nicer matrix later.
df_mpc = df_pc.drop(columns="postcode").drop_duplicates()
df_mpc = df_mpc.sort_values(by=["ppv_region", "master_postcode"]) 

display(df_mpc)

In [None]:
# A list of dicts containing all valid master postcode pairings, to be converted to csv later.
mpc_pairs = []

# Generate distance matrix.
pc_labels = [(x, y) for (x, y) in zip(df_mpc["ppv_region"], df_mpc["master_postcode"])]
master_lat_lon = {x: y for (x, y) in zip(df_mpc["master_postcode"], df_mpc["lat_lon"])}

# dataframe_matrix
df_mat = pd.DataFrame(
    pd.NA, 
    index=pd.MultiIndex.from_tuples(pc_labels), 
    columns=pd.MultiIndex.from_tuples(pc_labels),
)  # type: ignore

# Diagonals are zero distance
for tup in pc_labels:
    df_mat.loc[tup, tup] = 0

# Off-diagonals, calculate the haversine distance.
for tup1, tup2 in combinations(pc_labels, 2):
    ppv1, mpc1 = tup1
    ppv2, mpc2 = tup2
    if ppv1 == ppv2:
        if mpc1 == mpc2:
            df_mat.loc[tup1, tup2] = 0
        else:
            lat1, lon1 = master_lat_lon[mpc1]
            lat2, lon2 = master_lat_lon[mpc2]
            distance = haversine_km(lat1, lon1, lat2, lon2)
            df_mat.loc[tup1, tup2] = distance
            df_mat.loc[tup2, tup1] = distance  # reflect across the diagonal
            # This is a valid pair, so add it to the list.
            mpc_pairs.append({
                "ppv_region": ppv1,
                "master_postcode_1": mpc1,
                "latitude_1": lat1,
                "longitude_1": lon1,
                "master_postcode_2": mpc2,
                "latitude_2": lat2,
                "longitude_2": lon2, 
            })


In [None]:
# Save to Excel sheets:
with pd.ExcelWriter("./output/postcode_output.xlsx") as writer:
    df_pc.to_excel(writer, sheet_name="Postcodes")
    df_mpc.to_excel(writer, sheet_name="Master Postcodes")
    df_mat.to_excel(writer, sheet_name="Master Postcode Distance Matrix")

# Save the master postcode pairs as a csv
with open("./output/mpc_pairs.csv", mode="w", newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames=mpc_pairs[0].keys())
    writer.writeheader()
    for pair in mpc_pairs:
        writer.writerow(pair)



In [None]:
with open("./output/distance_matrix.pickle", mode="wb") as fp:
    pickle.dump(df_mat, fp)

In [None]:
num = len(mpc_pairs)
print(f"There are a total of {num} valid master postcode pairs.")
print(f"Total cost in Maps API: USD{num * 0.005:2g}.")
print("-----")
print(f"You can find the data output files in the `output` folder.")
