In [1]:
import pandas as pd
import numpy as np
from src.utils import check_hash, postcode_str

In [2]:
# Input files: the Pos Malaysia spreadsheet and the postcode.my geo CSV.
POSM = "./data/Poscode - Lat Long.xlsx"
POSTCODEMY = "./data/dep_poskod_geo_web.csv"

# Stop early if either input fails the project's hash check, and say which one.
assert check_hash(POSM), f"hash check failed for {POSM}"
assert check_hash(POSTCODEMY), f"hash check failed for {POSTCODEMY}"

# The postcode columns are passed through postcode_str while parsing.
# NOTE(review): presumably this keeps postcodes as zero-padded text — confirm in src.utils.
df_posm_import = pd.read_excel(POSM, sheet_name="Sheet2", converters={"POSTCODE": postcode_str}) # Pos Malaysia data
df_postcodemy_import = pd.read_csv(POSTCODEMY, converters={"postcode": postcode_str}) # postcode.my data

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each summary.
df_posm_import.info()
df_postcodemy_import.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71031 entries, 0 to 71030
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LOCATION       71031 non-null  object 
 1   POSTCODE       71031 non-null  object 
 2   POST_OFFICE    71031 non-null  object 
 3   POST_OFFICE_1  71031 non-null  object 
 4   STATE          71031 non-null  object 
 5   DATEUPDATE     71031 non-null  object 
 6   POINT_X        71031 non-null  float64
 7   POINT_Y        71031 non-null  float64
dtypes: float64(2), object(6)
memory usage: 4.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   postcode     2778 non-null   object 
 1   state_given  2778 non-null   object 
 2   lat          2778 non-null   float64
 3   lon          2778 non-null   float64
dtypes: float64(2), object(2)
memory 

In [3]:
# Keep just the postcode column from each source, one entry per distinct code.
# The Pos Malaysia column is renamed so both series share the name "postcode".
posm_codes = df_posm_import["POSTCODE"]
df_posm = posm_codes.rename("postcode").drop_duplicates()
df_postcodemy = df_postcodemy_import["postcode"].drop_duplicates()

# Preview each series (head + shape), with a divider line between the two.
previews = [("Pos Malaysia data:", df_posm), ("postcode.my data:", df_postcodemy)]
for position, (heading, codes) in enumerate(previews):
    if position > 0:
        print("-----")
    print(heading)
    display(codes.head())
    print(codes.shape)


Pos Malaysia data:


0    76100
1    70200
2    70300
4    71950
5    43000
Name: postcode, dtype: object

(2886,)
-----
postcode.my data:


0    82100
1    86100
2    81920
3    81930
4    81440
Name: postcode, dtype: object

(2778,)


In [4]:
# postcode.my codes that do not appear in the Pos Malaysia list.
missing_from_posm = ~df_postcodemy.isin(df_posm)
not_in_posm = df_postcodemy.loc[missing_from_posm].sort_values()
print(not_in_posm)


# Pos Malaysia codes that do not appear in postcode.my.
# Don't worry about these.
missing_from_postcodemy = ~df_posm.isin(df_postcodemy)
not_in_postcodemy = df_posm.loc[missing_from_postcodemy].sort_values()
print(not_in_postcodemy)

230     06660
243     08330
295     08340
269     09090
281     09410
        ...  
1424    88610
1434    88628
1442    88658
1443    88660
1444    88661
Name: postcode, Length: 88, dtype: object
7348     01542
7349     05646
7350     06309
7351     08107
7352     08109
         ...  
20338    96510
20339    96609
20340    96857
20341    97307
20342    98207
Name: postcode, Length: 196, dtype: object


In [5]:
# For every postcode.my code missing from the Pos Malaysia list, find the
# Pos Malaysia code that shares its first 2 digits and whose remaining digits
# are numerically closest.

# Group the Pos Malaysia codes by their first 2 digits once, up front, instead
# of re-scanning the whole series for every missing code. Codes keep their
# original order inside each group, so ties in min() resolve as before.
posm_by_prefix = {}
for posm_pc in df_posm:
    posm_by_prefix.setdefault(posm_pc[:2], []).append(posm_pc)

results = []
for pc in not_in_posm:
    similar = posm_by_prefix.get(pc[:2], [])
    if len(similar) == 0:
        raise ValueError(f"{pc} has no Pos Malaysia postcode with the same first 2 digits!")
    # Compare only the digits after the shared 2-digit prefix.
    pc_suffix = int(pc[2:])
    nearest = min(similar, key=lambda x: abs(int(x[2:]) - pc_suffix))
    results.append((pc, nearest))
df = pd.DataFrame(results, columns=["postcode.my", "closest_posm"])
display(df)

# TODO: check if these coordinates are close enough