In [1]:
import numpy as np
import pandas as pd
import pickle
import re
from tqdm import tqdm

In [2]:
raw_df = pd.read_csv('data/train.csv')

In [3]:
df = raw_df.copy()
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [4]:
# split columns for POI and street
df['POI'] = df['POI/street'].apply(lambda x: x.split('/')[0])
df['street'] = df['POI/street'].apply(lambda x: x.split('/')[1])

In [5]:
df

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru
...,...,...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/,,
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,


In [7]:
df.iloc[5]

id                                            5
raw_address    raya samb gede, 299 toko bb kids
POI/street          toko bb kids/raya samb gede
POI                                toko bb kids
street                           raya samb gede
Name: 5, dtype: object

In [41]:
df.iloc[6230]

id                                                      6230
raw_address    indo alam (da pemb, tegalsawah karawang timur
POI/street                               /indo alam (da pemb
POI                                                         
street                                    indo alam (da pemb
Name: 6230, dtype: object

In [4]:
# example for data format to train on spaCy
train = [ # entities exact start_letter, but +1 at end_letter like range()
    ("jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat", {"entities": [(0,40,"STREET")]}),
    ("raya samb gede, 299 toko bb kids", {"entities": [(20,31,"POI"), (0,14,"STREET")]})
]

## Try using Regular Expression to preprocess and extract (doesn't work)

In [18]:
train_data = []
entities = {"entities": []}
for row in tqdm(df.itertuples(index = False), total = len(df)):
    if row[3] != '': # POI
        try:
            match = re.search(row[3], row[1])
            if match != None:
                entities['entities'].append((match.start(), match.end(), "POI"))
        except:
            pass
    
    if row[4] != '': # street
        try:
            match = re.search(row[4], row[1])
            if match != None:
                entities['entities'].append((match.start(), match.end(), "STREET"))
        except:
            pass
    
    # in case of, poi/street doesn't match the raw_address
    # need to improve this later because this has to fix/fill the missing POI/stress
#     for index, element in enumerate(entities['entities']):
#         if element[0] == -1:
#             entities['entities'].pop(index)
    
#     #in case of, both POI and street are uncomplete
#     for index, element in enumerate(entities['entities']):
#         if element[0] == -1:
#             entities['entities'].pop(index)
    
    train_data.append((row[1], entities))
    entities = {"entities": []}

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:22<00:00, 13136.49it/s]


## A better way to preprocess data to be ready to train on spaCy

In [85]:
# in case of using .find and there is a overlap text, we can use .find and specify start argument to be
# an index of '/', this will do!!!
train_data_final = []
entities = {"entities": []}
for row in tqdm(df.itertuples(index = False), total = len(df)):
    if row[3] != '': # POI
        entities['entities'].append((row[1].find(row[3]), row[1].find(row[3]) + len(row[3]), 'POI'))
    
    # in case of, poi doesn't match the raw_address
    if entities['entities'] != [] and entities['entities'][0][0] == -1:
        entities['entities'].pop()
    
    if row[4] != '': # street
        # in case of overlapping
        if entities['entities'] != [] and ((row[1].find(row[4]) >= entities['entities'][0][0]) and (row[1].find(row[4]) + len(row[4]) <= entities['entities'][0][1])):
            # start find after the last char of POI
            entities['entities'].append((row[1].find(row[4], row[1].find(row[3]) + len(row[3])), 
                                         row[1].find(row[4], row[1].find(row[3]) + len(row[3])) + len(row[4]), 'STREET'))
        else:
            entities['entities'].append((row[1].find(row[4]), row[1].find(row[4]) + len(row[4]), 'STREET'))

    #in case of, both POI and street are uncomplete
#     for index, element in enumerate(entities['entities']):
#         if element[0] == -1:
#             entities['entities'].pop(index)
    
    #in case of, both POI and street are uncomplete
    for index, element in enumerate(entities['entities']):
        if element[0] == -1:
            entities['entities'].pop(index)
    
    train_data_final.append((row[1], entities))
    entities = {"entities": []}

100%|██████████████████████████████████████████████████████████████████████| 300000/300000 [00:02<00:00, 120394.02it/s]


In [86]:
result = pd.DataFrame(train_data)
result_final = pd.DataFrame(train_data_final)

## Compare to make sure the final version is better than the first one

In [47]:
result_final.head(100)

Unnamed: 0,0,1
0,jl kapuk timur delta sili iii lippo cika 11 a ...,"{'entities': [(0, 40, 'STREET')]}"
1,"aye, jati sampurna",{'entities': []}
2,setu siung 119 rt 5 1 13880 cipayung,"{'entities': [(5, 10, 'STREET')]}"
3,"toko dita, kertosono","{'entities': [(0, 9, 'POI')]}"
4,jl. orde baru,"{'entities': [(0, 13, 'STREET')]}"
...,...,...
95,"cau terr, gal,",{'entities': []}
96,taman kota kedaung kali angke gg.h.musanif no ...,"{'entities': [(0, 10, 'POI')]}"
97,aren jaya sumb iv 319 rt 3 10 17111 bekasi timur,"{'entities': [(10, 17, 'STREET')]}"
98,"shi mel, nanggalo","{'entities': [(0, 7, 'STREET')]}"


In [89]:
result[result[1] != result_final[1]]

Unnamed: 0,0,1
983,"knalpot putra mahakam, maha 11 65111 klojen","{'entities': [(0, 21, 'POI'), (14, 18, 'STREET..."
2233,"kintawani car wash, kinta, no c 16","{'entities': [(0, 18, 'POI'), (0, 5, 'STREET')]}"
2921,"angsa, 26 cntrl + p,","{'entities': [(0, 5, 'STREET')]}"
3256,"dr. gema nazri yanni, m.ked(ped), sp.a bunga l...","{'entities': [(39, 48, 'STREET')]}"
4382,"bank rakyat indonesia (persero) tbk. pt, haurw...",{'entities': []}
...,...,...
297759,mekar jaya griya asri utara vii griya asri uta...,"{'entities': [(11, 31, 'POI'), (11, 31, 'STREE..."
298010,"lya & nurul (lyn), m h tham boule, tanah abang","{'entities': [(19, 33, 'STREET')]}"
298366,"dip, 12 natasha diponegoro surabaya, rw 1 wono...","{'entities': [(8, 35, 'POI'), (16, 22, 'STREET..."
298818,"sekolah dasar negeri (sdn) 190, r kemu, ario k...","{'entities': [(32, 38, 'STREET')]}"


In [90]:
result_final[result[1] != result_final[1]]

Unnamed: 0,0,1
983,"knalpot putra mahakam, maha 11 65111 klojen","{'entities': [(0, 21, 'POI'), (23, 27, 'STREET..."
2233,"kintawani car wash, kinta, no c 16","{'entities': [(0, 18, 'POI'), (20, 25, 'STREET..."
2921,"angsa, 26 cntrl + p,","{'entities': [(10, 19, 'POI'), (0, 5, 'STREET')]}"
3256,"dr. gema nazri yanni, m.ked(ped), sp.a bunga l...","{'entities': [(0, 38, 'POI'), (39, 48, 'STREET..."
4382,"bank rakyat indonesia (persero) tbk. pt, haurw...","{'entities': [(0, 39, 'POI')]}"
...,...,...
297759,mekar jaya griya asri utara vii griya asri uta...,"{'entities': [(11, 31, 'POI'), (32, 52, 'STREE..."
298010,"lya & nurul (lyn), m h tham boule, tanah abang","{'entities': [(0, 17, 'POI'), (19, 33, 'STREET..."
298366,"dip, 12 natasha diponegoro surabaya, rw 1 wono...","{'entities': [(8, 35, 'POI')]}"
298818,"sekolah dasar negeri (sdn) 190, r kemu, ario k...","{'entities': [(0, 30, 'POI'), (32, 38, 'STREET..."


In [104]:
result_final.iloc[3091][0], result_final.iloc[3091][1]

('jl. pasar senen dlm. iv no. 37 rt rw 007 04 kelurahan senen kecamatan senen 10410',
 {'entities': [(4, 15, 'POI'), (0, 15, 'STREET')]})

In [105]:
df.iloc[3091]

id                                                          3091
raw_address    jl. pasar senen dlm. iv no. 37 rt rw 007 04 ke...
POI/street                           pasar senen/jl. pasar senen
POI                                                  pasar senen
street                                           jl. pasar senen
Name: 3091, dtype: object

## Save the data

In [64]:
with open('pickle/spacy_train.pickle','wb') as file:
    pickle.dump(train_data, file)

In [103]:
with open('pickle/spacy_train_final.pickle','wb') as file:
    pickle.dump(train_data_final, file)