In [1]:
import json
import pickle
from functools import lru_cache
from itertools import combinations

import numpy as np
import pandas as pd
import shapely.geometry

import src.utils


In [2]:
# Check file hash, then import PosM file. Keep the POSTCODE column as string.
POSM_FILE = "./data/Poscode - Lat Long.xlsx"
assert src.utils.check_hash(POSM_FILE)
df_import = pd.read_excel(POSM_FILE, sheet_name="Sheet2", dtype={"POSTCODE": str})


In [3]:
df = df_import.rename(
    columns={
        "LOCATION": "location",
        "POSTCODE": "postcode",
        "POST_OFFICE": "district_0",
        "POST_OFFICE_1": "district_1",
        "STATE": "state",
        "DATEUPDATE": "last_updated",
        "POINT_X": "lon",
        "POINT_Y": "lat",
    }
)
display(df.head())
display(df.info())


Unnamed: 0,location,postcode,district_0,district_1,state,last_updated,lon,lat
0,Pusat Komersial & Perindustrian Gangsa Jaya,76100,Melaka,Melaka,Melaka,2020-10-02 00:00:00.0000000,102.264939,2.214094
1,Rimbun Kiara,70200,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
2,Rimbun Impian,70300,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
3,Rimbun Jasmin,70300,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934
4,Nusari Bayu 2,71950,Seremban,Seremban,Negeri Sembilan,2020-10-02 00:00:00.0000000,101.941512,2.71934


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71031 entries, 0 to 71030
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   location      71031 non-null  object 
 1   postcode      71031 non-null  object 
 2   district_0    71031 non-null  object 
 3   district_1    71031 non-null  object 
 4   state         71031 non-null  object 
 5   last_updated  71031 non-null  object 
 6   lon           71031 non-null  float64
 7   lat           71031 non-null  float64
dtypes: float64(2), object(6)
memory usage: 4.3+ MB


None

 # Cleanup

In [4]:
# White space and capitalisation.
for col in ["location", "postcode", "district_0", "district_1", "state"]:
    df[col] = df[col].apply(lambda x: x.strip().removesuffix("_x000D_"))
    df[col] = df[col].apply(lambda x: x[0].upper() + x[1:])

# We can drop location and date_updated, as it's not very useful.
# Note that we also reordered lat and lon columns.
df = df[["postcode", "district_0", "district_1", "state", "lat", "lon"]]
df = df.drop_duplicates()

# Clean state column.
CLEANED_STATES = ["Johor", "Kedah", "Kelantan", "Melaka", "Negeri Sembilan", "Pahang", 
    "Perak", "Perlis", "Pulau Pinang", "Sabah", "Sarawak", "Selangor", "Terengganu",
    "WP Kuala Lumpur", "WP Labuan", "WP Putrajaya"]

def unclean_states():
    return sorted([s for s in df.state.unique() if s not in CLEANED_STATES])

print("Unclean states:", unclean_states())

clean = {"WP kuala Lumpur": "WP Kuala Lumpur"}
df["state"] = df["state"].apply(lambda x: clean[x] if x in clean.keys() else x)
assert unclean_states() == []
print("Unclean states:", unclean_states())

# Why are there two district fields? After cleaning, they are the same.
df_districts = df.loc[:, ["district_0", "district_1"]].drop_duplicates()
for col in df_districts:
    print(f"{col} has {len(df_districts[col].unique())} unique values.")
assert len(df_districts.district_0.unique()) == len(df_districts.district_0.unique())

# We can drop district_0 and rename district_1 to district.
df = df.drop(columns="district_0").rename(columns={"district_1": "district"})
display(df.head())

# Add PPV regions: KL, Selangor & Putrajaya are the same region
df["ppv_region"] = df["state"].apply(
    lambda x: "Selangor/KL/Putrajaya"
    if x in ["Selangor", "WP Kuala Lumpur", "WP Putrajaya"]
    else x
)


Unclean states: ['WP kuala Lumpur']
Unclean states: []
district_0 has 437 unique values.
district_1 has 437 unique values.


Unnamed: 0,postcode,district,state,lat,lon
0,76100,Melaka,Melaka,2.214094,102.264939
1,70200,Seremban,Negeri Sembilan,2.71934,101.941512
2,70300,Seremban,Negeri Sembilan,2.71934,101.941512
4,71950,Seremban,Negeri Sembilan,2.71934,101.941512
5,43000,Kajang,Selangor,2.99319,101.787


In [5]:
# Unique data per column
for c in df.columns:
    print(c, len(df[c].unique()))


postcode 2886
district 437
state 16
lat 419
lon 419
ppv_region 14


# Investigation #1: First two digits of postcodes

The first two digits of a postcode are *not* unique to a state.

In [6]:
df["postcode_2digits"] = df["postcode"].apply(lambda x: x[:2])
temp = df[["postcode_2digits", "state"]].drop_duplicates()

# Note that some of these "two-digit postcodes" span multiple states.
temp = temp["postcode_2digits"].value_counts()
temp = temp[temp > 1]

print("These 2-digit postcodes span multiple states:")
display(temp)
print("-----")

# Retrieve all full postcodes with this problem
bad_postcodes = temp.index.to_list()
bad_rows_idx = df[df["postcode_2digits"].isin(bad_postcodes)].index.to_list()
df_bad = df.loc[bad_rows_idx].sort_values("postcode")
print(f"There are {len(df_bad)} bad postcodes.")
display(df_bad)
df = df.drop(columns="postcode_2digits")

# TODO: What do we do with these "bad" postcodes?

These 2-digit postcodes span multiple states:


34    2
14    2
Name: postcode_2digits, dtype: int64

-----
There are 37 bad postcodes.


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,postcode_2digits
668,14000,Bukit Mertajam,Pulau Pinang,5.365022,100.459264,Pulau Pinang,14
1455,14007,Bukit Mertajam,Pulau Pinang,5.365022,100.459264,Pulau Pinang,14
1456,14009,Bukit Mertajam,Pulau Pinang,5.365022,100.459264,Pulau Pinang,14
1457,14020,Bukit Mertajam,Pulau Pinang,5.365022,100.459264,Pulau Pinang,14
600,14100,Simpang Ampat,Pulau Pinang,5.283955,100.476943,Pulau Pinang,14
1913,14110,Simpang Ampat,Pulau Pinang,5.283955,100.476943,Pulau Pinang,14
1921,14120,Simpang Ampat,Pulau Pinang,5.283955,100.476943,Pulau Pinang,14
601,14200,Sungai Jawi,Pulau Pinang,5.215871,100.497033,Pulau Pinang,14
649,14300,Nibong Tebal,Pulau Pinang,5.167157,100.475425,Pulau Pinang,14
1805,14310,Nibong Tebal,Pulau Pinang,5.167157,100.475425,Pulau Pinang,14


For example, it seems that 14xxx postcodes are all in Pulau Pinang... except for 14390 which is in Kedah.

Similarly, 34xxx postcodes are all in Perak except 34950 which is in Kedah.

# Investigation #2: Districts having multiple coordinates

For most of the file, every postcode in the same district has the same coordinates. Not great (we would prefer more accurate postcode locations), but good enough.

However, there are some districts that share the same coordinates. These problematic districts are those that have the same name but in different states.

In [7]:
# Show only the districts that have the same name across multiple states.
gb = df[["district", "state"]].drop_duplicates().groupby("district").count()  # gb = groupby
gb = gb[gb["state"] > 1]  # Only those with multiples
display(gb)

# Store this list of districts.
bad_districts = gb.index.to_list()

# Helper function to filter out bad districts only.
def bad_districts_only(df):
    mask = df["district"].isin(bad_districts)
    df = df[mask].sort_values(["district", "state", "postcode"])
    return df

display(bad_districts_only(df))


Unnamed: 0_level_0,state
district,Unnamed: 1_level_1
Ayer Hitam,2
Hulu Langat,2
Jeram,2
Kepala Batas,2
Rantau Panjang,2
Serdang,2
Simpang Ampat,2


Unnamed: 0,postcode,district,state,lat,lon,ppv_region
6029,86100,Ayer Hitam,Johor,6.238259,100.245701,Johor
19789,86107,Ayer Hitam,Johor,6.238259,100.245701,Johor
6891,6150,Ayer Hitam,Kedah,6.238259,100.245701,Kedah
254,43100,Hulu Langat,Selangor,3.100379,101.802997,Selangor/KL/Putrajaya
58624,53100,Hulu Langat,WP Kuala Lumpur,3.100379,101.802997,Selangor/KL/Putrajaya
56852,31850,Jeram,Perak,3.26099,101.304808,Perak
6140,45800,Jeram,Selangor,3.26099,101.304808,Selangor/KL/Putrajaya
6153,6200,Kepala Batas,Kedah,5.610519,100.444822,Kedah
7573,6207,Kepala Batas,Kedah,5.610519,100.444822,Kedah
7574,6209,Kepala Batas,Kedah,5.610519,100.444822,Kedah


We have a file with hand-corrected coordinates for these bad postcodes. Let's merge them into our main dataframe.

In [8]:
FIX1_FILE = "./data/bad_postcode_data_Thev.xlsx"
assert src.utils.check_hash(FIX1_FILE)
df_import2 = pd.read_excel(
    FIX1_FILE,
    sheet_name="Sheet1",
    converters={"postcode": src.utils.postcode_str},
)
df_fix = df_import2[["postcode", "state", "lat", "lon"]].set_index("postcode")
fixed_postcodes = df_fix.index.to_list()
bdst_postcodes = fixed_postcodes[:]  # store for later

print("Fixed districts file:")
display(df_fix)

# Merge in the fixed coordinates from df_fix.
def fix_coords(series):
    pc = series["postcode"]
    if pc in fixed_postcodes:
        return df_fix.loc[pc][["lat", "lon"]]
    else:
        return series[["lat", "lon"]]

df[["lat_fix", "lon_fix"]] = df.apply(fix_coords, axis="columns")
print("-----")
print("Merged df with fixed coordinates:")
display(bad_districts_only(df))

# Overwrite lat and lon.
df[["lat", "lon"]] = df[["lat_fix", "lon_fix"]]
df = df.drop(columns=["lat_fix", "lon_fix"])
print("-----")
print("Corrected df:")
display(df)


Fixed districts file:


Unnamed: 0_level_0,state,lat,lon
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2700,Perlis,6.342472,100.189822
2707,Perlis,6.342472,100.189822
2709,Perlis,6.342472,100.189822
2800,Perlis,6.342472,100.189822
14100,Pulau Pinang,5.283955,100.476943
14110,Pulau Pinang,5.283955,100.476943
14120,Pulau Pinang,5.283955,100.476943
6200,Kedah,6.216755,100.399978
6207,Kedah,6.216755,100.399978
6209,Kedah,6.216755,100.399978


-----
Merged df with fixed coordinates:


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,lat_fix,lon_fix
6029,86100,Ayer Hitam,Johor,6.238259,100.245701,Johor,1.956969,103.20613
19789,86107,Ayer Hitam,Johor,6.238259,100.245701,Johor,1.956969,103.20613
6891,6150,Ayer Hitam,Kedah,6.238259,100.245701,Kedah,6.238259,100.245701
254,43100,Hulu Langat,Selangor,3.100379,101.802997,Selangor/KL/Putrajaya,3.100379,101.802997
58624,53100,Hulu Langat,WP Kuala Lumpur,3.100379,101.802997,Selangor/KL/Putrajaya,3.260238,101.733399
56852,31850,Jeram,Perak,3.26099,101.304808,Perak,4.393723,101.192217
6140,45800,Jeram,Selangor,3.26099,101.304808,Selangor/KL/Putrajaya,3.26099,101.304808
6153,6200,Kepala Batas,Kedah,5.610519,100.444822,Kedah,6.216755,100.399978
7573,6207,Kepala Batas,Kedah,5.610519,100.444822,Kedah,6.216755,100.399978
7574,6209,Kepala Batas,Kedah,5.610519,100.444822,Kedah,6.216755,100.399978


-----
Corrected df:


Unnamed: 0,postcode,district,state,lat,lon,ppv_region
0,76100,Melaka,Melaka,2.214094,102.264939,Melaka
1,70200,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan
2,70300,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan
4,71950,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan
5,43000,Kajang,Selangor,2.993190,101.787000,Selangor/KL/Putrajaya
...,...,...,...,...,...,...
70583,88875,Kota Kinabalu,Sabah,5.982556,116.074506,Sabah
70584,89707,Bongawan,Sabah,5.527186,115.857611,Sabah
70585,91128,Lahad Datu,Sabah,5.024247,118.330774,Sabah
70588,32100,TLDM Lumut,Perak,4.211214,100.641323,Perak


# Investigation #3: Coordinates inside state boundaries

We perform a sanity check that all coordinates are inside state boundaries, as defined by a GeoJson file.

In [9]:
with open("./data/json_1_states.json") as fp:
    states_gj = json.load(fp)  # gj = geojson

for feature in states_gj["features"]:
    feature["polygon"] = shapely.geometry.shape(feature["geometry"])

# Find which polygon contains the given coordinates, return None otherwise.
@lru_cache
def reverse_geocode(lon, lat):
    point = shapely.geometry.Point(lon, lat)  # lon/lat
    for feature in states_gj["features"]:
        if feature["polygon"].contains(point):
            return (feature["properties"])["state"].title()
    return "None"


df["state_gj"] = (
    df[["lat", "lon"]]
    .apply(lambda x: reverse_geocode(x["lon"], x["lat"]), axis="columns")
)  # reversed lat/lon!

print("States in PosM file:")
display(df["state"].drop_duplicates())
print("-----")
print("States in GeoJson file:")
display(df["state_gj"].drop_duplicates())



States in PosM file:


0                 Melaka
1        Negeri Sembilan
5               Selangor
8                 Pahang
17                 Johor
43              Kelantan
56                 Perak
81       WP Kuala Lumpur
252         WP Putrajaya
446         Pulau Pinang
764              Sarawak
1134          Terengganu
2038               Kedah
2513              Perlis
9705               Sabah
44175          WP Labuan
Name: state, dtype: object

-----
States in GeoJson file:


0                   Melaka
1                   Negeri
5                 Selangor
8                   Pahang
17                   Johor
43                Kelantan
56                   Perak
81       W.P. Kuala Lumpur
252         W.P. Putrajaya
446           Pulau Pinang
764                Sarawak
1134            Terengganu
2038                 Kedah
2513                Perlis
9705                 Sabah
44175          W.P. Labuan
Name: state_gj, dtype: object

In [10]:
# Clean the geojson state names:
ok_states = df.state.drop_duplicates().to_list()
gj_to_posm = {
    "Negeri": "Negeri Sembilan",
    "W.P. Kuala Lumpur": "WP Kuala Lumpur",
    "W.P. Putrajaya": "WP Putrajaya",
    "W.P. Labuan": "WP Labuan",
}

df["state_gj"] = df["state_gj"].apply(lambda x: x if x in ok_states else gj_to_posm[x])
display(df)


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,state_gj
0,76100,Melaka,Melaka,2.214094,102.264939,Melaka,Melaka
1,70200,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,Negeri Sembilan
2,70300,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,Negeri Sembilan
4,71950,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,Negeri Sembilan
5,43000,Kajang,Selangor,2.993190,101.787000,Selangor/KL/Putrajaya,Selangor
...,...,...,...,...,...,...,...
70583,88875,Kota Kinabalu,Sabah,5.982556,116.074506,Sabah,Sabah
70584,89707,Bongawan,Sabah,5.527186,115.857611,Sabah,Sabah
70585,91128,Lahad Datu,Sabah,5.024247,118.330774,Sabah,Sabah
70588,32100,TLDM Lumut,Perak,4.211214,100.641323,Perak,Perak


In [11]:
# oob = out of boundaries
df_oob = df[df["state"] != df["state_gj"]].sort_values(["lat", "lon", "postcode"])
display(df_oob)


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,state_gj
62401,47810,Petaling jaya,Selangor,3.063544,101.694744,Selangor/KL/Putrajaya,WP Kuala Lumpur
1687,46000,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
928,46050,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
992,46100,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
1689,46150,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
...,...,...,...,...,...,...,...
173,43200,Cheras,Selangor,3.098030,101.736950,Selangor/KL/Putrajaya,WP Kuala Lumpur
45711,43207,Cheras,Selangor,3.098030,101.736950,Selangor/KL/Putrajaya,WP Kuala Lumpur
448,53100,Kuala Lumpur,WP Kuala Lumpur,3.260238,101.733399,Selangor/KL/Putrajaya,Selangor
58624,53100,Hulu Langat,WP Kuala Lumpur,3.260238,101.733399,Selangor/KL/Putrajaya,Selangor


In [12]:
oob_postcodes = df_oob["postcode"].to_list()

FIX2_FILE = "./data/posm_coords_not_in_state_geojson.csv"
assert src.utils.check_hash(FIX2_FILE)
df_import3 = pd.read_csv( 
    FIX2_FILE,
    usecols=["postcode", "lat", "lon"],
    dtype={"lat": float, "lon": float},
    converters={"postcode": src.utils.postcode_str},
)
df_fix = df_import3.set_index("postcode")
fixed_postcodes = df_fix.index.to_list()

print("Fixed postcodes file:")
display(df_fix)

df[["lat_fix", "lon_fix"]] = df.apply(fix_coords, axis="columns")
print("-----")
print("Merged df with fixed coordinates:")
display(df[df["postcode"].isin(oob_postcodes)])

# Overwrite lat and lon.
df[["lat", "lon"]] = df[["lat_fix", "lon_fix"]]
df = df.drop(columns=["lat_fix", "lon_fix"])
print("-----")
print("Corrected df:")
display(df[df["postcode"].isin(oob_postcodes)])

del df["state_gj"]


Fixed postcodes file:


Unnamed: 0_level_0,lat,lon
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
47301,3.086439,101.649091
46200,3.086439,101.649091
43200,3.048914,101.768898
47810,3.086439,101.649091
53100,3.260238,101.733399
...,...,...
46796,3.110464,101.626332
46797,3.110464,101.626332
46798,3.110464,101.626332
46799,3.110464,101.626332


-----
Merged df with fixed coordinates:


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,state_gj,lat_fix,lon_fix
127,47301,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.086439,101.649091
129,46200,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.086439,101.649091
173,43200,Cheras,Selangor,3.098030,101.736950,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.048914,101.768898
387,47810,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.086439,101.649091
448,53100,Kuala Lumpur,WP Kuala Lumpur,3.260238,101.733399,Selangor/KL/Putrajaya,Selangor,3.260238,101.733399
...,...,...,...,...,...,...,...,...,...
63196,46797,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.110464,101.626332
63197,46798,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.110464,101.626332
63198,46799,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.110464,101.626332
63199,46800,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur,3.110464,101.626332


-----
Corrected df:


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,state_gj
127,47301,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
129,46200,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
173,43200,Cheras,Selangor,3.048914,101.768898,Selangor/KL/Putrajaya,WP Kuala Lumpur
387,47810,Petaling Jaya,Selangor,3.086439,101.649091,Selangor/KL/Putrajaya,WP Kuala Lumpur
448,53100,Kuala Lumpur,WP Kuala Lumpur,3.260238,101.733399,Selangor/KL/Putrajaya,Selangor
...,...,...,...,...,...,...,...
63196,46797,Petaling Jaya,Selangor,3.110464,101.626332,Selangor/KL/Putrajaya,WP Kuala Lumpur
63197,46798,Petaling Jaya,Selangor,3.110464,101.626332,Selangor/KL/Putrajaya,WP Kuala Lumpur
63198,46799,Petaling Jaya,Selangor,3.110464,101.626332,Selangor/KL/Putrajaya,WP Kuala Lumpur
63199,46800,Petaling Jaya,Selangor,3.110464,101.626332,Selangor/KL/Putrajaya,WP Kuala Lumpur


 # Master postcode

We map every postcode to its "master postcode": a similar postcode with the same lat_long coordinates, but with a lower level. If there are multiple candidates for the master postcode, pick the smaller postcode (when treated as an integer).

Example: if 54000, 54100, 54120, 60000, 61000, 61900, 61990 and 70000 all share the same coordinates, we map all of them to the same "master postcode": 60000 (a 1st-level postcode).

In [13]:
# Helper function
def postcode_level(postcode: str) -> int:
    """Returns the postcode level: number of digits before the ending string of zeroes.

    e.g. 54000 -> Level 2 postcode
         54680 -> Level 4 postcode
    """
    return len(postcode.rstrip("0"))


 Unfortunately, there are some postcodes that each have multiple coordinates...

In [14]:
# Find all postcodes which have multiple coordinates
temp = df[["postcode", "lat", "lon"]].groupby("postcode").count()
temp["count"] = temp.apply(max, axis="columns")
temp = temp.drop(columns=["lat", "lon"])

# Filter only the ones with multiple coordinates
temp = temp[temp["count"] > 1].sort_values(by="count", ascending=False)
display(temp)
print("-----")
print("Example: postcode 21400")
display(df[df["postcode"] == "21400"])


Unnamed: 0_level_0,count
postcode,Unnamed: 1_level_1
21400,3
6010,2
57000,2
81300,2
76100,2
75460,2
72120,2
71800,2
59200,2
53100,2


-----
Example: postcode 21400


Unnamed: 0,postcode,district,state,lat,lon,ppv_region
4908,21400,Bukit Payong,Terengganu,5.232199,103.101954,Terengganu
14255,21400,Kuala Terengganu,Terengganu,5.337306,103.13771,Terengganu
61939,21400,Chalok,Terengganu,5.424328,102.837014,Terengganu


 For these postcodes, we take the average coordinate. We are assuming that the postcode
 boundaries are convex, and that the average coordinate is still within the postcode
 boundary.

 This will break the "one district, one coordinate" rule established above, unfortunately.

In [15]:
# Pick the smallest coordinates (tuple comparison always compares latitude first)
df[["lat_mean", "lon_mean"]] = (
    df[["postcode", "lat", "lon"]].groupby("postcode").transform(np.mean)
)
print("Example: postcode 21400")
display(df[df["postcode"] == "21400"])

# If all ok, overwrite the lat and lon columns:
df[["lat", "lon"]] = df[["lat_mean", "lon_mean"]]
df = df.drop(columns=["lat_mean", "lon_mean"])


Example: postcode 21400


Unnamed: 0,postcode,district,state,lat,lon,ppv_region,lat_mean,lon_mean
4908,21400,Bukit Payong,Terengganu,5.232199,103.101954,Terengganu,5.331278,103.025559
14255,21400,Kuala Terengganu,Terengganu,5.337306,103.13771,Terengganu,5.331278,103.025559
61939,21400,Chalok,Terengganu,5.424328,102.837014,Terengganu,5.331278,103.025559


 Now, we create a mapping of all postcodes to their master postcode.

In [16]:
# Create mapping of all postcodes to their master postcode

# Master postcode is the postcode with the most zeroes at the end. Break ties (e.g. 50000, 60000) by picking the smaller number.
def master_postcode_sort(postcode: str):
    return (postcode_level(postcode), int(postcode))

def master_postcode_agg(series):
    postcodes = series.to_list()
    return min(postcodes, key=master_postcode_sort)

df["master_postcode"] = (
    df[["postcode", "lat", "lon"]]
    .groupby(["lat", "lon"])
    .transform(master_postcode_agg)
)
display(df.head())



Unnamed: 0,postcode,district,state,lat,lon,ppv_region,master_postcode
0,76100,Melaka,Melaka,2.262406,102.272489,Melaka,76100
1,70200,Seremban,Negeri Sembilan,2.71934,101.941512,Negeri Sembilan,70000
2,70300,Seremban,Negeri Sembilan,2.71934,101.941512,Negeri Sembilan,70000
4,71950,Seremban,Negeri Sembilan,2.71934,101.941512,Negeri Sembilan,70000
5,43000,Kajang,Selangor,2.99319,101.787,Selangor/KL/Putrajaya,43000


In [17]:
df_check = df.copy()
# View it all together
df_check[["master_district", "master_state", "master_lat", "master_lon"]] = (
    df_check["master_postcode"].
    apply(lambda x: df[df.loc[:, "postcode"] == x].iloc[0].loc[["district", "state", "lat", "lon"]])
)
df_check["postcode_to_master_distance"] = (
    df_check.apply(
        lambda x: src.utils.haversine_km(x["lat"], x["lon"], x["master_lat"], x["master_lon"]),
        axis="columns"
    )
)
df_check["district_match"] = df_check.apply(lambda x: x["district"] == x["master_district"], axis="columns")
df_check["state_match"] = df_check.apply(lambda x: x["state"] == x["master_state"], axis="columns")
df_check["bad_district_fix?"] = df_check["postcode"].apply(lambda x: x in bdst_postcodes)
df_check["state_boundary_fix?"] = df_check["postcode"].apply(lambda x: x in oob_postcodes)
display(df_check)

Unnamed: 0,postcode,district,state,lat,lon,ppv_region,master_postcode,master_district,master_state,master_lat,master_lon,postcode_to_master_distance,district_match,state_match,bad_district_fix?,state_boundary_fix?
0,76100,Melaka,Melaka,2.262406,102.272489,Melaka,76100,Melaka,Melaka,2.262406,102.272489,0.0,True,True,False,False
1,70200,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,70000,Seremban,Negeri Sembilan,2.719340,101.941512,0.0,True,True,False,False
2,70300,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,70000,Seremban,Negeri Sembilan,2.719340,101.941512,0.0,True,True,False,False
4,71950,Seremban,Negeri Sembilan,2.719340,101.941512,Negeri Sembilan,70000,Seremban,Negeri Sembilan,2.719340,101.941512,0.0,True,True,False,False
5,43000,Kajang,Selangor,2.993190,101.787000,Selangor/KL/Putrajaya,43000,Kajang,Selangor,2.993190,101.787000,0.0,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70583,88875,Kota Kinabalu,Sabah,5.982556,116.074506,Sabah,88000,Kota Kinabalu,Sabah,5.982556,116.074506,0.0,True,True,False,False
70584,89707,Bongawan,Sabah,5.527186,115.857611,Sabah,89700,Bongawan,Sabah,5.527186,115.857611,0.0,True,True,False,False
70585,91128,Lahad Datu,Sabah,5.024247,118.330774,Sabah,91100,Lahad Datu,Sabah,5.024247,118.330774,0.0,True,True,False,False
70588,32100,TLDM Lumut,Perak,4.211214,100.641323,Perak,32100,TLDM Lumut,Perak,4.211214,100.641323,0.0,True,True,False,False


# Export files

In [18]:
NAME = "posm-postcodes"
# Final, clean postcode table.
df_pc = df[["postcode", "master_postcode", "state", "ppv_region", "lat", "lon"]].drop_duplicates()
df_pc = df_pc.sort_values(["state", "postcode"])
display(df_pc)

df_check.to_excel(f"./output/{NAME}-check.xlsx")
df_pc.to_excel(f"./output/{NAME}-full.xlsx")
df_pc.to_pickle(f"./output/{NAME}-full.pkl")

Unnamed: 0,postcode,master_postcode,state,ppv_region,lat,lon
708,79000,79000,Johor,Johor,1.425136,103.61443
58864,79050,79000,Johor,Johor,1.425136,103.61443
80,79100,79000,Johor,Johor,1.425136,103.61443
427,79150,79000,Johor,Johor,1.425136,103.61443
118,79200,79000,Johor,Johor,1.425136,103.61443
...,...,...,...,...,...,...
66223,62676,62000,WP Putrajaya,Selangor/KL/Putrajaya,2.937230,101.69482
66224,62677,62000,WP Putrajaya,Selangor/KL/Putrajaya,2.937230,101.69482
66225,62686,62000,WP Putrajaya,Selangor/KL/Putrajaya,2.937230,101.69482
6725,62690,62000,WP Putrajaya,Selangor/KL/Putrajaya,2.937230,101.69482


In [19]:
df_export = df_pc[["state", "postcode", "master_postcode"]]
display(df_export)
df_export.to_csv(f"./output/{NAME}-core.csv", sep="|", index=False)

# TODO: perhaps, export ppv_region instead of state?

Unnamed: 0,state,postcode,master_postcode
708,Johor,79000,79000
58864,Johor,79050,79000
80,Johor,79100,79000
427,Johor,79150,79000
118,Johor,79200,79000
...,...,...,...
66223,WP Putrajaya,62676,62000
66224,WP Putrajaya,62677,62000
66225,WP Putrajaya,62686,62000
6725,WP Putrajaya,62690,62000
