In [1]:
import pandas as pd
import numpy as np
import src.utils

# Pos Malaysia data vs. postcode.my data

We have two separate data files containing postcode data; one from Pos Malaysia and one scraped from postcode.my. They don't map cleanly, so let's investigate.

In [2]:
# Import files, check the hash to ensure no changes have been made.
POSM = "./data/Poscode - Lat Long.xlsx"
POSTCODEMY = "./data/dep_poskod_geo_web.csv"
assert src.utils.check_hash(POSM), f"Hash check failed for {POSM}"
assert src.utils.check_hash(POSTCODEMY), f"Hash check failed for {POSTCODEMY}"

# The postcode columns are parsed through src.utils.postcode_str instead of
# pandas' type inference (presumably to keep leading zeros such as "06660"
# intact -- confirm against src.utils).
df_posm_import = pd.read_excel(
    POSM,
    sheet_name="Sheet2",
    converters={"POSTCODE": src.utils.postcode_str},
)  # Pos Malaysia data
df_postcodemy_import = pd.read_csv(
    POSTCODEMY,
    converters={"postcode": src.utils.postcode_str},
)  # postcode.my data

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
print("PosM data:")
df_posm_import.info()
print("-----")
print("postcode.my data:")
df_postcodemy_import.info()

PosM data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71031 entries, 0 to 71030
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LOCATION       71031 non-null  object 
 1   POSTCODE       71031 non-null  object 
 2   POST_OFFICE    71031 non-null  object 
 3   POST_OFFICE_1  71031 non-null  object 
 4   STATE          71031 non-null  object 
 5   DATEUPDATE     71031 non-null  object 
 6   POINT_X        71031 non-null  float64
 7   POINT_Y        71031 non-null  float64
dtypes: float64(2), object(6)
memory usage: 4.3+ MB
None
-----
postcode.my data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   postcode     2778 non-null   object 
 1   state_given  2778 non-null   object 
 2   lat          2778 non-null   float64
 3   lon          2778 non-null   float64
dt

In [3]:
# Reduce each dataset to its unique postcodes; that is all the comparison needs.
df_posm = df_posm_import["POSTCODE"].drop_duplicates().rename("postcode")
df_postcodemy = df_postcodemy_import["postcode"].drop_duplicates()


def _preview(title, postcodes):
    """Print a title, show the first rows of the series, then print its shape."""
    print(title)
    display(postcodes.head())
    print(postcodes.shape)


_preview("Pos Malaysia data:", df_posm)
print("-----")
_preview("postcode.my data:", df_postcodemy)


Pos Malaysia data:


0    76100
1    70200
2    70300
4    71950
5    43000
Name: postcode, dtype: object

(2886,)
-----
postcode.my data:


0    82100
1    86100
2    81920
3    81930
4    81440
Name: postcode, dtype: object

(2778,)


Here are the:

- **Postcodes in postcode.my, which are *not* in PosM** - these are the postcodes we will map to their closest PosM postcode below.
- **Postcodes in PosM, which are *not* in postcode.my** - for informational purposes only.

In [4]:
# Postcodes present in one source but absent from the other, sorted ascending.
known_to_posm = df_postcodemy.isin(df_posm)
known_to_postcodemy = df_posm.isin(df_postcodemy)
not_in_posm = df_postcodemy.loc[~known_to_posm].sort_values()
not_in_postcodemy = df_posm.loc[~known_to_postcodemy].sort_values()

print(
    "In postcode.my, NOT in PosM:",
    not_in_posm,
    "-----",
    "In PosM, NOT in postcode.my:",
    not_in_postcodemy,
    sep="\n",
)

In postcode.my, NOT in PosM:
230     06660
243     08330
295     08340
269     09090
281     09410
        ...  
1424    88610
1434    88628
1442    88658
1443    88660
1444    88661
Name: postcode, Length: 88, dtype: object
-----
In PosM, NOT in postcode.my:
7348     01542
7349     05646
7350     06309
7351     08107
7352     08109
         ...  
20338    96510
20339    96609
20340    96857
20341    97307
20342    98207
Name: postcode, Length: 196, dtype: object


# Closest valid PosM postcode

Let's call each postcode.my postcode which isn't in PosM an **invalid** postcode. We want to match it to the closest **valid** postcode in the PosM dataset.

We'll use a simple heuristic: for each postcode, let's call its first two digits the **region**. E.g. 50480 --> 50, 07893 --> 07

For each invalid postcode, find the set of all valid postcodes with the same region. Then, pick the valid postcode whose last 3 digits are closest to the invalid postcode's last 3 digits, when both are treated as integers.

Example: Invalid postcode - **31900**. Valid postcodes with region 31 - **{31000, 31800, 31890, 31920}**. Then the closest valid postcode is **31890** (difference of 10).

In [5]:
# Run the heuristic.
# Bucket the valid PosM postcodes by region (first 2 digits) once, instead of
# re-scanning the whole series for every invalid postcode. Each bucket keeps the
# postcodes in their df_posm order, so ties in the distance below still resolve
# to whichever candidate appears first in df_posm.
posm_by_region = {}
for valid_pc in df_posm:
    posm_by_region.setdefault(valid_pc[:2], []).append(valid_pc)

results = []
for pc in not_in_posm:
    similar = posm_by_region.get(pc[:2], [])
    if not similar:
        raise ValueError(f"{pc} has no Pos Malaysia postcode with the same first 2 digits!")
    # Distance = absolute difference of the last 3 digits, compared as integers.
    nearest = min(similar, key=lambda x: abs(int(x[2:]) - int(pc[2:])))
    results.append((pc, nearest))
df = pd.DataFrame(results, columns=["postcode.my", "closest_posm"])
display(df)

# TODO: check if these coordinates are close enough

Unnamed: 0,postcode.my,closest_posm
0,06660,06650
1,08330,08320
2,08340,08320
3,09090,09100
4,09410,09409
...,...,...
83,88610,88609
84,88628,88626
85,88658,88656
86,88660,88662


# Export files

In [6]:
# Write the postcode.my -> closest PosM mapping as a pipe-delimited file, without the row index.
export_path = "./output/postcode.my-to-posm.csv"
df.to_csv(export_path, sep="|", index=False)