# Data Collection + Data Processing

### Labelling data fields for raw data from Kaggle

In [1]:
import pandas as pd
import numpy as np
import re
import csv

In [43]:
# Read the raw export with every column kept as text, then keep only the rows
# that actually have an address value.
raw_frame = pd.read_csv("./data/raw_data.csv", sep=",", dtype=str)
df = raw_frame.dropna(subset=['address'])

In [44]:
# add in street num column
# Strip every non-digit character from each street string, leaving only its digits.
# - The pattern is a raw string: "\D" inside a normal string literal is an invalid
#   escape sequence and triggers a warning on recent Python versions.
# - The vectorized .str accessor replaces the row-by-row loop and passes missing
#   streets through as NaN instead of raising.
# NOTE: all digit runs of a street are concatenated (e.g. "A 1 B 2" gives "12").
street_num_list = df['street'].str.replace(r"\D", "", regex=True).tolist()

In [45]:
# Attach the extracted digits as the `street_num` column of a new frame and preview it.
df2 = df.assign(**{'street_num': street_num_list})
df2

Unnamed: 0,address,street_name,street,block,street_num
0,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO,ANG MO KIO AVE 4,170,4
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO,ANG MO KIO AVE 4,170,4
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO,ANG MO KIO AVE 4,170,4
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO,ANG MO KIO AVE 4,170,4
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO,ANG MO KIO AVE 4,170,4
...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS,WOODLANDS ST 83,863,83
881920,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS,WOODLANDS ST 83,863,83
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS,WOODLANDS ST 83,863,83
881922,864 WOODLANDS STREET 83 SINGAPORE 730864,WOODLANDS,WOODLANDS ST 83,864,83


In [46]:
# create street name and street number columns
# For every row:
#   street_num  -> the digits of the street, normalised through int() (leading zeros dropped)
#   street_name -> the street with those digits removed
# Rows whose street has no digits keep the full street as the name and '' as the number.
street_num_list = []
street_name_list = []

for street, raw_num in zip(df2['street'], df2['street_num']):
    if pd.isna(raw_num) or raw_num == '':
        street_name_list.append(street)
        street_num_list.append('')
        continue

    street_num_list.append(str(int(raw_num)))
    # Remove the digits exactly as they appear in the street. Removing the
    # int()-normalised form instead would leave a stray "0" behind for a
    # zero-padded number such as "04".
    # Surrounding whitespace is intentionally left as-is (e.g. "AVE 4" -> "AVE ").
    street_name_list.append(street.replace(raw_num, ""))

In [47]:
# Overwrite `street_num` and `street_name` with the freshly computed lists and preview.
recomputed_columns = {'street_num': street_num_list, 'street_name': street_name_list}
df3 = df2.assign(**recomputed_columns)
df3

Unnamed: 0,address,street_name,street,block,street_num
0,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4
...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83
881920,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83
881922,864 WOODLANDS STREET 83 SINGAPORE 730864,WOODLANDS ST,WOODLANDS ST 83,864,83


In [48]:
# generate floor and unit numbers for addresses
# create floor_num, unit_num and floor_unit columns
#
# A synthetic "#FF-UU" token is inserted into every address just before its last
# 16 characters, and the floor / unit parts are stored in their own columns.
# The slicing below (s[:-17] ... s[-16:]) assumes every address ends with a
# 17-character tail of the form " SINGAPORE 123456" -- TODO confirm against the raw data.
#
# Values are built as plain lists and attached with assign(): writing into the
# row objects yielded by iterrows() is not guaranteed to reach the frame.
# NOTE: re-running this cell inserts a second token; re-run the cell that builds df3 first.

rng = np.random.default_rng(42)  # fixed seed so the generated labels are reproducible
n_rows = len(df3)

# floor in 1..29, unit in 1..98 (upper bounds are exclusive)
floor_values = rng.integers(1, 30, size=n_rows)
unit_values = rng.integers(1, 99, size=n_rows)

# redraw any unit that equals its floor, until none collide
clash = unit_values == floor_values
while clash.any():
    unit_values[clash] = rng.integers(1, 99, size=int(clash.sum()))
    clash = unit_values == floor_values

floor_num_list = ['{:02}'.format(value) for value in floor_values.tolist()]
unit_num_list = ['{:02}'.format(value) for value in unit_values.tolist()]
floor_unit_list = ['#' + floor + '-' + unit for floor, unit in zip(floor_num_list, unit_num_list)]

address_with_unit = [
    s[:-17] + ' ' + floor_unit + ' ' + s[-16:]
    for s, floor_unit in zip(df3['address'], floor_unit_list)
]

df3 = df3.assign(
    address=address_with_unit,
    floor_num=floor_num_list,
    unit_num=unit_num_list,
    floor_unit=floor_unit_list,
)

In [49]:
# Preview df3 together with its floor_num, unit_num and floor_unit columns.
df3

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit
0,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #22-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,22,49,#22-49
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,17,#15-17
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,27,49,#27-49
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,42,#15-42
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,25,34,#25-34
...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,20,80,#20-80
881920,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,29,97,#29-97
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,17,07,#17-07
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS ST,WOODLANDS ST 83,864,83,07,57,#07-57


In [50]:
# create postal code column
# The last six characters of each address are kept when they are entirely numeric;
# otherwise the postal code is left as an empty string.

address_tails = (str(full_address[-6:]) for full_address in df3['address'])
postal_code_list = [tail if tail.isnumeric() else '' for tail in address_tails]


In [51]:
# Add the extracted postal codes as a new column and preview the result.
df4 = df3.assign(**{'postal_code': postal_code_list})
df4

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit,postal_code
0,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #22-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,22,49,#22-49,560170
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,17,#15-17,560170
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,27,49,#27-49,560170
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,42,#15-42,560170
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,25,34,#25-34,560170
...,...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,20,80,#20-80,730863
881920,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,29,97,#29-97,730863
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,17,07,#17-07,730863
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS ST,WOODLANDS ST 83,864,83,07,57,#07-57,730864


In [52]:
# Checkpoint the frame in both pickle and CSV form (CSV written without the index).
stage_1_pickle_path = "./data/data_creation/data_1.pkl"
stage_1_csv_path = r'./data/data_creation/data_1.csv'
df4.to_pickle(stage_1_pickle_path)
df4.to_csv(stage_1_csv_path, index=False)

In [53]:
# Reload the first checkpoint from its pickle file.
stage_1_checkpoint = "./data/data_creation/data_1.pkl"
df5 = pd.read_pickle(stage_1_checkpoint)

In [54]:
# create country column
# The country is the text sitting between "<floor_unit> " and " <postal_code>" in
# the address; it is kept only when it is one of the known countries in country_db,
# otherwise the row gets ''.
# Explicit checks replace the former bare `except:`, which also swallowed
# KeyboardInterrupt and so made this long loop impossible to interrupt.

country_db = ['SINGAPORE'] # to add on for future
country_list = []
for address, floor_unit, postal_code in zip(df5['address'], df5['floor_unit'], df5['postal_code']):
    unit_start = str(floor_unit) + ' '
    end = ' ' + str(postal_code)

    # a non-text address or an address without the floor/unit token has no country
    pieces = address.split(unit_start) if isinstance(address, str) else []
    if len(pieces) < 2:
        country_list.append('')
        continue

    candidate = pieces[1].split(end)[0]
    country_list.append(candidate if candidate in country_db else '')  # check if country is valid

In [55]:
# Add the detected countries as a new column and preview the result.
df6 = df5.assign(**{'country': country_list})
df6

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit,postal_code,country
0,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #22-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,22,49,#22-49,560170,SINGAPORE
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,17,#15-17,560170,SINGAPORE
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,27,49,#27-49,560170,SINGAPORE
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,42,#15-42,560170,SINGAPORE
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,25,34,#25-34,560170,SINGAPORE
...,...,...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,20,80,#20-80,730863,SINGAPORE
881920,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,29,97,#29-97,730863,SINGAPORE
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,17,07,#17-07,730863,SINGAPORE
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS ST,WOODLANDS ST 83,864,83,07,57,#07-57,730864,SINGAPORE


In [56]:
# add in 'BLOCK' before block number for some addresses
# Only rows whose index label is a multiple of 10 are touched: a missing or empty
# address becomes '', any other address gets a "BLOCK " prefix. All other rows
# keep their address unchanged.
address_list = []
for label, current_address in df6['address'].items():
    if label % 10 != 0:
        address_list.append(current_address)
    elif pd.isna(current_address) or current_address == '':
        address_list.append('')
    else:
        address_list.append("BLOCK " + current_address)

In [57]:
# Swap in the addresses that now carry the occasional "BLOCK " prefix and preview.
df7 = df6.assign(**{'address': address_list})
df7

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit,postal_code,country
0,BLOCK 170 ANG MO KIO AVENUE 4 KEBUN BARU LINK ...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,22,49,#22-49,560170,SINGAPORE
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,17,#15-17,560170,SINGAPORE
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,27,49,#27-49,560170,SINGAPORE
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,15,42,#15-42,560170,SINGAPORE
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVE,ANG MO KIO AVE 4,170,4,25,34,#25-34,560170,SINGAPORE
...,...,...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,20,80,#20-80,730863,SINGAPORE
881920,BLOCK 863 WOODLANDS STREET 83 MY WORLD PRESCHO...,WOODLANDS ST,WOODLANDS ST 83,863,83,29,97,#29-97,730863,SINGAPORE
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS ST,WOODLANDS ST 83,863,83,17,07,#17-07,730863,SINGAPORE
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS ST,WOODLANDS ST 83,864,83,07,57,#07-57,730864,SINGAPORE


In [58]:
# no buildings yet
# Checkpoint the frame in both pickle and CSV form (CSV written without the index).
stage_2_pickle_path = "./data/data_creation/data_2.pkl"
stage_2_csv_path = r'./data/data_creation/data_2.csv'
df7.to_pickle(stage_2_pickle_path)
df7.to_csv(stage_2_csv_path, index=False)

In [59]:
# Reload the second checkpoint from its pickle file.
stage_2_checkpoint = "./data/data_creation/data_2.pkl"
df8 = pd.read_pickle(stage_2_checkpoint)

In [60]:
# to standardise streets by replacing short forms with the complete forms (optional)
#
# Street abbreviations are expanded in a single regex pass that only matches whole
# words. Chained substring replaces corrupted values: "LOR " -> "LORONG " followed
# by "LOR" -> "LORONG" produced "LORONGONG", and " ST" matched inside longer words.
# The single pass is also idempotent, so re-running the cell is harmless for streets.
# Results are attached with assign(): writing into the row objects yielded by
# iterrows() is not guaranteed to reach the frame.

fields = ['street', 'street_name']

# abbreviations expanded wherever they occur as a whole word
_WHOLE_WORD_FORMS = {
    "BLDG": "BUILDING",
    "BLK": "BLOCK",
    "BLVD": "BOULEVARD",
    "BT": "BUKIT",
    "BKT": "BUKIT",
    "CRES": "CRESCENT",
    "CTRL": "CENTRAL",
    "CTR": "CENTRE",
    "GDN": "GARDEN",
    "GDNS": "GARDENS",
    "JLN": "JALAN",
    "LOR": "LORONG",
    "TG": "TANJONG",
    "PK": "PARK",
    "KG": "KAMPONG",
}

# abbreviations expanded only when a space precedes them, i.e. never at the very
# start of the string (so a leading "ST" is left alone)
_AFTER_SPACE_FORMS = {
    "CL": "CLOSE",
    "CT": "COURT",
    "DR": "DRIVE",
    "HTS": "HEIGHTS",
    "LK": "LINK",
    "MT": "MOUNT",
    "NTH": "NORTH",
    "RD": "ROAD",
    "SQ": "SQUARE",
    "STH": "SOUTH",
    "ST": "STREET",
    "AVE": "AVENUE",
    "TER": "TERRACE",
    "PL": "PLACE",
}

_STREET_EXPANSIONS = {**_WHOLE_WORD_FORMS, **_AFTER_SPACE_FORMS}


def _alternation(forms):
    """Return a regex alternation of the keys of `forms`, longest key first."""
    return "|".join(re.escape(key) for key in sorted(forms, key=len, reverse=True))


_STREET_ABBREVIATION_RE = re.compile(
    r"(?:\b(?:{})|(?<= )(?:{}))\b".format(
        _alternation(_WHOLE_WORD_FORMS), _alternation(_AFTER_SPACE_FORMS)
    )
)


def _standardise_street(text):
    """Expand the known abbreviations in one street string; non-strings pass through."""
    if not isinstance(text, str):
        return text
    # forms containing punctuation or surrounding spaces are handled literally first
    text = text.replace("ST.", "SAINT")
    text = text.replace("C'WEALTH", "COMMONWEALTH")
    text = text.replace(" UPP ", " UPPER ")
    return _STREET_ABBREVIATION_RE.sub(lambda found: _STREET_EXPANSIONS[found.group(0)], text)


def _standardise_address(text):
    """Expand the few abbreviations handled in full addresses; non-strings pass through."""
    if not isinstance(text, str):
        return text
    text = text.replace(" TER ", " TERRACE ")
    text = text.replace(" PL ", " PLACE ")
    text = text.replace(" UPP ", " UPPER ")
    return text.replace("BLK", "BLOCK")


df8 = df8.assign(
    **{field: df8[field].map(_standardise_street) for field in fields},
    address=df8['address'].map(_standardise_address),
)

In [61]:
# Preview df8 to inspect its street, street_name and address columns.
df8

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit,postal_code,country
0,BLOCK 170 ANG MO KIO AVENUE 4 KEBUN BARU LINK ...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,22,49,#22-49,560170,SINGAPORE
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,15,17,#15-17,560170,SINGAPORE
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,27,49,#27-49,560170,SINGAPORE
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,15,42,#15-42,560170,SINGAPORE
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,25,34,#25-34,560170,SINGAPORE
...,...,...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,20,80,#20-80,730863,SINGAPORE
881920,BLOCK 863 WOODLANDS STREET 83 MY WORLD PRESCHO...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,29,97,#29-97,730863,SINGAPORE
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,17,07,#17-07,730863,SINGAPORE
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS STREET,WOODLANDS STREET 83,864,83,07,57,#07-57,730864,SINGAPORE


In [62]:
# create building column
# The building name is the text between "<street> " and the first following " #"
# in the address. Rows without such a match, rows whose captured text still
# contains '#', and rows with a non-text address or street get ''.
# - The street is passed through re.escape so characters such as '.', '(' or '+'
#   are matched literally instead of being read as regex syntax.
# - Explicit checks replace the former bare `except:`, which also swallowed
#   KeyboardInterrupt.

building_list = []

for address, street in zip(df8['address'], df8['street']):
    building = ''
    if isinstance(address, str) and isinstance(street, str):
        # use street and floor_unit to retrieve the building names
        start = re.escape(street + ' ')
        end = ' #'
        found = re.search(r'{}(.*?){}'.format(start, end), address)
        if found is not None and '#' not in found.group(1):
            building = found.group(1).strip()

    building_list.append(building)

In [63]:
# Add the extracted building names as a new column and preview the result.
df9 = df8.assign(**{'building': building_list})
df9

Unnamed: 0,address,street_name,street,block,street_num,floor_num,unit_num,floor_unit,postal_code,country,building
0,BLOCK 170 ANG MO KIO AVENUE 4 KEBUN BARU LINK ...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,22,49,#22-49,560170,SINGAPORE,KEBUN BARU LINK 1
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,15,17,#15-17,560170,SINGAPORE,KEBUN BARU LINK 1
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,27,49,#27-49,560170,SINGAPORE,KEBUN BARU LINK 1
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,15,42,#15-42,560170,SINGAPORE,KEBUN BARU LINK 1
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,ANG MO KIO AVENUE,ANG MO KIO AVENUE 4,170,4,25,34,#25-34,560170,SINGAPORE,KEBUN BARU LINK 1
...,...,...,...,...,...,...,...,...,...,...,...
881919,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,20,80,#20-80,730863,SINGAPORE,MY WORLD PRESCHOOL LTD.
881920,BLOCK 863 WOODLANDS STREET 83 MY WORLD PRESCHO...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,29,97,#29-97,730863,SINGAPORE,MY WORLD PRESCHOOL LTD.
881921,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,WOODLANDS STREET,WOODLANDS STREET 83,863,83,17,07,#17-07,730863,SINGAPORE,MY WORLD PRESCHOOL LTD.
881922,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,WOODLANDS STREET,WOODLANDS STREET 83,864,83,07,57,#07-57,730864,SINGAPORE,


In [73]:
# Combined "<country> <postal_code>" column, added to df9 itself.
df9['country_postal'] = df9['country'] + ' ' + df9['postal_code']
# note that empty fields are kept as '' instead of NaN for future data processing. They will be replaced with NaN before being trained.
extended_columns = [
    'address', 'block', 'street', 'street_name', 'street_num', 'building',
    'floor_unit', 'floor_num', 'unit_num', 'country', 'postal_code', 'country_postal',
]
extended_view = df9[extended_columns]
# keep the first row of every distinct address and renumber the index
df10 = extended_view.drop_duplicates(subset=['address'], ignore_index=True)
df10

Unnamed: 0,address,block,street,street_name,street_num,building,floor_unit,floor_num,unit_num,country,postal_code,country_postal
0,BLOCK 170 ANG MO KIO AVENUE 4 KEBUN BARU LINK ...,170,ANG MO KIO AVENUE 4,ANG MO KIO AVENUE,4,KEBUN BARU LINK 1,#22-49,22,49,SINGAPORE,560170,SINGAPORE 560170
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,170,ANG MO KIO AVENUE 4,ANG MO KIO AVENUE,4,KEBUN BARU LINK 1,#15-17,15,17,SINGAPORE,560170,SINGAPORE 560170
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,170,ANG MO KIO AVENUE 4,ANG MO KIO AVENUE,4,KEBUN BARU LINK 1,#27-49,27,49,SINGAPORE,560170,SINGAPORE 560170
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,170,ANG MO KIO AVENUE 4,ANG MO KIO AVENUE,4,KEBUN BARU LINK 1,#15-42,15,42,SINGAPORE,560170,SINGAPORE 560170
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,170,ANG MO KIO AVENUE 4,ANG MO KIO AVENUE,4,KEBUN BARU LINK 1,#25-34,25,34,SINGAPORE,560170,SINGAPORE 560170
...,...,...,...,...,...,...,...,...,...,...,...,...
864478,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,863,WOODLANDS STREET 83,WOODLANDS STREET,83,MY WORLD PRESCHOOL LTD.,#20-80,20,80,SINGAPORE,730863,SINGAPORE 730863
864479,BLOCK 863 WOODLANDS STREET 83 MY WORLD PRESCHO...,863,WOODLANDS STREET 83,WOODLANDS STREET,83,MY WORLD PRESCHOOL LTD.,#29-97,29,97,SINGAPORE,730863,SINGAPORE 730863
864480,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,863,WOODLANDS STREET 83,WOODLANDS STREET,83,MY WORLD PRESCHOOL LTD.,#17-07,17,07,SINGAPORE,730863,SINGAPORE 730863
864481,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,864,WOODLANDS STREET 83,WOODLANDS STREET,83,,#07-57,07,57,SINGAPORE,730864,SINGAPORE 730864


In [75]:
# Write the extended-field dataset in both pickle and CSV form (CSV without the index).
extended_pickle_path = "./data/data_creation/cleaned_data_w_extended_fields.pkl"
extended_csv_path = r'./data/data_creation/cleaned_data_w_extended_fields.csv'
df10.to_pickle(extended_pickle_path)
df10.to_csv(extended_csv_path, index=False)

In [68]:
# Label dataset: a column subset of df9, without rows lacking an address, one row
# per distinct address, and with empty / whitespace-only cells turned into NaN.
label_columns = ['address', 'block', 'street', 'building', 'floor_num', 'unit_num', 'postal_code', 'country']
labelled_rows = df9[label_columns].dropna(subset=['address'])
labelled_rows = labelled_rows.drop_duplicates(subset=['address'], ignore_index=True)
df11 = labelled_rows.replace(r'^\s*$', np.nan, regex=True)
df11

Unnamed: 0,address,block,street,building,floor_num,unit_num,postal_code,country
0,BLOCK 170 ANG MO KIO AVENUE 4 KEBUN BARU LINK ...,170,ANG MO KIO AVENUE 4,KEBUN BARU LINK 1,22,49,560170,SINGAPORE
1,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,170,ANG MO KIO AVENUE 4,KEBUN BARU LINK 1,15,17,560170,SINGAPORE
2,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #27-...,170,ANG MO KIO AVENUE 4,KEBUN BARU LINK 1,27,49,560170,SINGAPORE
3,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #15-...,170,ANG MO KIO AVENUE 4,KEBUN BARU LINK 1,15,42,560170,SINGAPORE
4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 #25-...,170,ANG MO KIO AVENUE 4,KEBUN BARU LINK 1,25,34,560170,SINGAPORE
...,...,...,...,...,...,...,...,...
864478,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,863,WOODLANDS STREET 83,MY WORLD PRESCHOOL LTD.,20,80,730863,SINGAPORE
864479,BLOCK 863 WOODLANDS STREET 83 MY WORLD PRESCHO...,863,WOODLANDS STREET 83,MY WORLD PRESCHOOL LTD.,29,97,730863,SINGAPORE
864480,863 WOODLANDS STREET 83 MY WORLD PRESCHOOL LTD...,863,WOODLANDS STREET 83,MY WORLD PRESCHOOL LTD.,17,07,730863,SINGAPORE
864481,864 WOODLANDS STREET 83 #07-57 SINGAPORE 730864,864,WOODLANDS STREET 83,,07,57,730864,SINGAPORE


In [69]:
# Write the label dataset in both pickle and CSV form (CSV without the index).
labels_pickle_path = "./data/data_creation/cleaned_data_w_labels.pkl"
labels_csv_path = r'./data/data_creation/cleaned_data_w_labels.csv'
df11.to_pickle(labels_pickle_path)
df11.to_csv(labels_csv_path, index=False)