In [1]:
!pip install pyahocorasick
!pip install spacy==3.0.*



In [4]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

# from pandarallel import pandarallel
# pandarallel.initialize()

import spacy
from spacy.training import Example
import random

import ahocorasick

In [6]:

df = pd.read_csv("./Address Elements Extraction Dataset/train.csv")
df.set_index("id", inplace=True)
df['POI'] = np.nan
df['street'] = np.nan

def extract_entities(row):
    extracted = row['POI/street'].split("/")
    
    if len(extracted) == 2:
        poi, street = extracted
        if poi.strip() != '':
            row['POI'] = poi
        
        if street.strip() != '':
            row['street'] = street
        
    return row

df = df.apply(extract_entities, axis=1)
nlp = spacy.blank('id')  # create blank Language class

In [7]:
df.head()

Unnamed: 0_level_0,raw_address,POI/street,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,"aye, jati sampurna",/,,
2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,"toko dita, kertosono",toko dita/,toko dita,
4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [66]:
from copy import deepcopy

def _build_aho(words):
    aho = ahocorasick.Automaton()
    for idx, key in enumerate(words):
        
        aho.add_word(key, (idx, key))

    return aho

def format_data(text, poi, street):
    print(poi)
    print(street)
    print(text, type(street), type(poi))
    entities = []
    _text = deepcopy(text)
    
    if isinstance(poi, str):
        aho = _build_aho([poi])
        aho.make_automaton()
        latest_char_idx = 0
        
        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            print(word, end, len(word), len(_text), start)
            
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'POI'))
            _text = _text.replace(word, " " * len(word))
            latest_char_idx = end + 1
    print("==================================================", latest_char_idx, _text)
    if isinstance(street, str):
        aho = _build_aho([street])
        aho.make_automaton()
        latest_char_idx = 0

        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            print(word, end, len(word), len(_text), start)
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'STREET'))
            latest_char_idx = end + 1
    te = nlp.make_doc(text)
    print(entities)
    return Example.from_dict(nlp.make_doc(text), {"entities": entities})

row = df.loc[5]
example = format_data(row['raw_address'], row['POI'], row['street'])
print(example)

toko bb kids
raya samb gede
raya samb gede, 299 toko bb kids <class 'str'> <class 'str'>
toko bb kids 31 12 32 20
raya samb gede 13 14 32 0
[(20, 32, 'POI'), (0, 14, 'STREET')]
{'doc_annotation': {'cats': {}, 'entities': ['B-STREET', 'I-STREET', 'L-STREET', 'O', 'O', 'B-POI', 'I-POI', 'L-POI'], 'links': {}}, 'token_annotation': {'ORTH': ['raya', 'samb', 'gede', ',', '299', 'toko', 'bb', 'kids'], 'SPACY': [True, True, False, True, True, True, True, False], 'TAG': ['', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7], 'DEP': ['', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0]}}


In [67]:
print("Preparing Spacy examples...")

# examples = []
# for idx in df.index:
#     try:
#         row = df.loc[idx]
#         print(row['raw_address'], "|| ", row['POI'], "|| ", row['street'])
#         example = format_data(row['raw_address'], row['POI'], row['street'])
#         examples.append(example)
#         assert(0)
#     except Exception as e:
#         print(idx)
#         print("-" * 50)
#         print(e)
#         break
print(nlp.pipe_names)

Preparing Spacy examples...
['ner']


In [11]:
def train_spacy(nlp, examples, iterations):
    TRAIN_DATA = examples
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for example in examples:
                nlp.update(
                    [example],
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [12]:
train = examples[:1000]

In [13]:
ner_nlp = train_spacy(nlp, train, 20)

Starting iteration 0
{'ner': 1709.6888900375807}
Starting iteration 1
{'ner': 1280.3647955898307}
Starting iteration 2
{'ner': 989.7089022598497}
Starting iteration 3
{'ner': 796.4730590667048}
Starting iteration 4
{'ner': 632.1378597545147}
Starting iteration 5
{'ner': 538.6968047420994}
Starting iteration 6
{'ner': 466.7558808627223}
Starting iteration 7
{'ner': 393.4218261669775}
Starting iteration 8
{'ner': 342.5487180961452}
Starting iteration 9
{'ner': 284.929109557395}
Starting iteration 10
{'ner': 332.9121753852147}
Starting iteration 11
{'ner': 268.88026969935265}
Starting iteration 12
{'ner': 233.48483350481223}
Starting iteration 13
{'ner': 213.0652872944358}
Starting iteration 14
{'ner': 220.90497312166806}
Starting iteration 15
{'ner': 162.32278404978697}
Starting iteration 16
{'ner': 192.0866079604451}
Starting iteration 17
{'ner': 177.09782033061475}
Starting iteration 18
{'ner': 172.8689141152765}
Starting iteration 19
{'ner': 150.817549991746}


In [14]:
for idx, row in df.iloc[100:110].iterrows():
    print(f"address: {row['raw_address']}")
    print(f"expected poi: {row['POI']}")
    print(f"expected street: {row['street']}")
    print()
    
    doc = ner_nlp(row['raw_address'])
    for ent in doc.ents:
        print(ent.text, "-", ent.label_)

    print("-" * 50)

address: kedai tenun jep senn, kota bumi, kebon melati
expected poi: kedai tenun jepara sennaart
expected street: kota bumi

kota bumi - STREET
--------------------------------------------------
address: wadungasri dalam waru raya wad asri, 24 sidoarjo
expected poi: dalam waru
expected street: raya wad asri

dalam waru - POI
raya wad asri - STREET
--------------------------------------------------
address: bulusan tim barat iii, no 35 3 tembalang
expected poi: nan
expected street: tim barat iii

tim barat iii - STREET
--------------------------------------------------
address: bakti jaya bukit perm vii 8 15315 setu
expected poi: nan
expected street: bukit perm vii

bukit perm vii - STREET
--------------------------------------------------
address: jl terusan buah batu no 185. samping indomaret. bandung.
expected poi: samping indomaret
expected street: jl terusan buah batu

jl terusan buah batu - STREET
samping indomaret - POI
--------------------------------------------------
address: 

In [15]:
df_test = pd.read_csv("./Address Elements Extraction Dataset/test.csv")
df_test.set_index("id", inplace=True)

In [16]:
submission = []
for idx, row in df_test.iloc[:].iterrows():
    doc = ner_nlp(row['raw_address'])
    tmp = {'id': idx}
    for ent in doc.ents:
        tmp[ent.label_] = ent.text
    submission.append(tmp)

In [17]:
pd.DataFrame(submission)

Unnamed: 0,id,STREET,POI
0,0,s. par,
1,1,angg per,
2,2,mand imog,
3,3,raya nga sri wedari,
4,4,cut,
...,...,...,...
49995,49995,,toko mbak farid semboro semboro
49996,49996,vete,
49997,49997,bakar malabar,
49998,49998,,graha indah


In [18]:
submission = pd.DataFrame(submission)
submission['POI/street'] = submission['POI'] + '/' + submission['STREET']

In [19]:
pd.DataFrame({'id':submission['id'],'POI/street':submission['POI/street']}).to_csv('./submission.csv', header=True, index=False)

In [20]:
submission.head()

Unnamed: 0,id,STREET,POI,POI/street
0,0,s. par,,
1,1,angg per,,
2,2,mand imog,,
3,3,raya nga sri wedari,,
4,4,cut,,
