In [1]:
import csv
import itertools

import openpyxl
import pandas as pd

import src.utils


In [2]:
# Load the full postcode table that Notebook 2 pickled.
df_pc = pd.read_pickle("./output/posm-postcodes-full.pkl")

# Keep only the distinct rows that remain once the per-postcode column is dropped,
# then order them by region (and master postcode) so the matrix built later is
# grouped region by region.
df_mpc = (
    df_pc.drop(columns="postcode")
    .drop_duplicates()
    .sort_values(by=["ppv_region", "master_postcode"])
)

display(df_mpc)

Unnamed: 0,master_postcode,state,ppv_region,lat,lon
708,79000,Johor,Johor,1.425136,103.614430
52400,80000,Johor,Johor,1.456123,103.761701
4237,81000,Johor,Johor,1.662964,103.600178
51,81300,Johor,Johor,1.497200,103.711685
3800,81400,Johor,Johor,1.606506,103.647617
...,...,...,...,...,...
15261,24060,Terengganu,Terengganu,4.230972,103.427966
6654,24100,Terengganu,Terengganu,4.335356,103.479837
6690,24200,Terengganu,Terengganu,4.426911,103.452517
7300,24300,Terengganu,Terengganu,4.500320,103.440871


 # Distance Matrix

 Generate a distance matrix: the haversine distance between any two master postcodes.
 Postcodes in different `ppv_region`s will be blank (NA) as we can't travel across borders.

 Since this is quite expensive computation-wise, we will also generate a csv which
 stores all the valid master postcode pairings and coordinates. This will be used for feeding into the Google Maps API.

In [3]:
# One record per valid (same-region) master postcode pairing; written to csv later.
mpc_pairs = []

# (ppv_region, master_postcode) labels, used for both axes of the matrix.
pc_labels = list(zip(df_mpc["ppv_region"], df_mpc["master_postcode"]))
# Lookup: master_postcode -> (lat, lon).
master_lat_lon = dict(
    zip(df_mpc["master_postcode"], zip(df_mpc["lat"], df_mpc["lon"]))
)

# The distance matrix, initially all NA; cells for cross-region pairs are never
# written and therefore stay NA.
df_mat = pd.DataFrame(
    pd.NA,
    index=pd.MultiIndex.from_tuples(pc_labels),
    columns=pd.MultiIndex.from_tuples(pc_labels),
)

# A postcode is at zero distance from itself.
for label in pc_labels:
    df_mat.loc[label, label] = 0

# Fill in every unordered pair of labels that shares a region.
for label_a, label_b in itertools.combinations(pc_labels, 2):
    region_a, postcode_a = label_a
    region_b, postcode_b = label_b

    # Different regions: leave the cell as NA.
    if region_a != region_b:
        continue

    # Same region and same master postcode: zero distance, not recorded as a pair.
    if postcode_a == postcode_b:
        df_mat.loc[label_a, label_b] = 0
        continue

    lat_a, lon_a = master_lat_lon[postcode_a]
    lat_b, lon_b = master_lat_lon[postcode_b]
    km_apart = src.utils.haversine_km(lat_a, lon_a, lat_b, lon_b)

    # Write both (a, b) and (b, a) so the matrix is symmetric.
    df_mat.loc[label_a, label_b] = km_apart
    df_mat.loc[label_b, label_a] = km_apart

    # Record the valid pair together with both sets of coordinates.
    pair_record = {
        "ppv_region": region_a,
        "master_postcode_1": postcode_a,
        "latitude_1": lat_a,
        "longitude_1": lon_a,
        "master_postcode_2": postcode_b,
        "latitude_2": lat_b,
        "longitude_2": lon_b,
    }
    mpc_pairs.append(pair_record)

display(df_mat)

Unnamed: 0_level_0,Unnamed: 1_level_0,Johor,Johor,Johor,Johor,Johor,Johor,Johor,Johor,Johor,Johor,...,Terengganu,Terengganu,Terengganu,Terengganu,Terengganu,Terengganu,Terengganu,Terengganu,Terengganu,WP Labuan
Unnamed: 0_level_1,Unnamed: 1_level_1,79000,80000,81000,81300,81400,81440,81450,81500,81550,81600,...,23200,23300,23400,24000,24050,24060,24100,24200,24300,87000
Johor,79000,0,16.718734,26.47602,13.448168,20.489126,50.102419,36.733454,14.816449,3.701622,55.620452,...,,,,,,,,,,
Johor,80000,16.718734,0,29.15918,7.190763,20.973199,49.457668,34.170561,28.579943,19.022374,40.170806,...,,,,,,,,,,
Johor,81000,26.47602,29.15918,0,22.197841,8.19327,23.72382,12.638248,19.979265,23.86066,65.792808,...,,,,,,,,,,
Johor,81300,13.448168,7.190763,22.197841,0,14.078112,43.46905,28.477578,22.474711,14.496835,46.80334,...,,,,,,,,,,
Johor,81400,20.489126,20.973199,8.19327,14.078112,0,30.178914,16.244601,18.859817,18.668948,58.141493,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Terengganu,24060,,,,,,,,,,,...,65.384164,40.133535,49.484964,12.154111,24.307933,0,12.945771,21.943034,29.965463,
Terengganu,24100,,,,,,,,,,,...,58.007021,32.268003,43.515205,20.107166,30.60046,12.945771,0,10.61483,18.833153,
Terengganu,24200,,,,,,,,,,,...,47.985241,22.440356,34.391023,24.557076,31.936762,21.943034,10.61483,0,8.258988,
Terengganu,24300,,,,,,,,,,,...,40.942914,16.313085,28.854912,30.893018,36.140766,29.965463,18.833153,8.258988,0,


# Export files

In [5]:
# Save the distance matrix.
NAME = "distance-matrix"

# Excel for easy human reading, since it preserves MultiIndex.
EXCEL_NAME = f"./output/{NAME}.xlsx"
# Freeze the top two rows and the left two columns (pane split at cell "C3") so
# the labels stay visible while scrolling. `freeze_panes` is applied by pandas
# during the write itself, so no second open/save pass over the workbook is
# needed and nothing depends on the default sheet name.
df_mat.to_excel(EXCEL_NAME, freeze_panes=(2, 2))

# Save as pickle for future notebooks.
df_mat.to_pickle(f"./output/{NAME}.pkl")


In [4]:
# Save the master postcode pairs for later notebooks.

# Column order of the csv. These must match the keys of the dicts stored in
# `mpc_pairs`; DictWriter raises ValueError if a record carries an unknown key.
# Listing them explicitly (rather than reading them off `mpc_pairs[0]`) means an
# empty `mpc_pairs` still produces a valid header-only file instead of an
# IndexError.
PAIR_FIELDNAMES = [
    "ppv_region",
    "master_postcode_1",
    "latitude_1",
    "longitude_1",
    "master_postcode_2",
    "latitude_2",
    "longitude_2",
]

# newline="" lets the csv module control line endings itself.
with open("./output/master-postcode-pairs.csv", mode="w", newline="") as fp:
    writer = csv.DictWriter(fp, fieldnames=PAIR_FIELDNAMES)
    writer.writeheader()
    writer.writerows(mpc_pairs)

# Estimated Google Maps API cost

In [6]:
# Price per master postcode pair, in USD, used for the estimate below.
# NOTE(review): presumably the Google Maps API per-request rate at the time of
# writing; confirm against current pricing.
COST_PER_PAIR_USD = 0.005

num = len(mpc_pairs)
print(f"There are a total of {num} valid master postcode pairs.")
# Fixed-point with two decimals, as expected for a currency amount. (A `g` spec
# would drop trailing zeros and switch to scientific notation for large totals.)
print(f"Total cost in Maps API: USD{num * COST_PER_PAIR_USD:.2f}.")
print("-----")
print("You can find the data output files in the `output` folder.")


There are a total of 8602 valid master postcode pairs.
Total cost in Maps API: USD43.01.
-----
You can find the data output files in the `output` folder.
