In [1]:
!pip install pyahocorasick
!pip install spacy==3.0.*

Collecting pyahocorasick
  Downloading pyahocorasick-1.4.1.tar.gz (321 kB)
Building wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py): started
  Building wheel for pyahocorasick (setup.py): finished with status 'done'
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.1-cp36-cp36m-win_amd64.whl size=38083 sha256=973be02debeddad60a11313273f2cd18d905d983a86945159573813715f5d06e
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\a5\cf\c2\c79ce865644045e393f55296c0a2e7127dc06d620bc139c15c
Successfully built pyahocorasick
Installing collected packages: pyahocorasick
Successfully installed pyahocorasick-1.4.1
Collecting spacy==3.0.*
  Downloading spacy-3.0.5-cp36-cp36m-win_amd64.whl (11.6 MB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp36-cp36m-win_amd64.whl (35 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp36-cp3

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

# from pandarallel import pandarallel
# pandarallel.initialize()

import spacy
from spacy.training import Example
import random

import ahocorasick
spacy.prefer_gpu()

True

In [6]:
df = pd.read_csv("./Address Elements Extraction Dataset/train.csv")
df.set_index("id", inplace=True)
df['POI'] = np.nan
df['street'] = np.nan

def extract_entities(row):
    extracted = row['POI/street'].split("/")
    
    if len(extracted) == 2:
        poi, street = extracted
        if poi.strip() != '':
            row['POI'] = poi
        
        if street.strip() != '':
            row['street'] = street
        
    return row

df = df.apply(extract_entities, axis=1)
nlp = spacy.blank('id')  # create blank Language class

In [7]:
df.head()

Unnamed: 0_level_0,raw_address,POI/street,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,"aye, jati sampurna",/,,
2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,"toko dita, kertosono",toko dita/,toko dita,
4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [8]:
from copy import deepcopy

def _build_aho(words):
    aho = ahocorasick.Automaton()
    for idx, key in enumerate(words):
        
        aho.add_word(key, (idx, key))

    return aho

def format_data(text, poi, street):
    entities = []
    _text = deepcopy(text)
    
    if isinstance(poi, str):
        aho = _build_aho([poi])
        aho.make_automaton()
        latest_char_idx = 0
        
        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'POI'))
            _text = _text.replace(word, " " * len(word))
            latest_char_idx = end + 1
    if isinstance(street, str):
        aho = _build_aho([street])
        aho.make_automaton()
        latest_char_idx = 0

        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'STREET'))
            latest_char_idx = end + 1
    te = nlp.make_doc(text)
    return Example.from_dict(nlp.make_doc(text), {"entities": entities}), 

# row = df.loc[5]
# example = format_data(row['raw_address'], row['POI'], row['street'])
# print(example)

In [9]:
print("Preparing Spacy examples...")

examples = []
for idx in df.index:
    try:
        row = df.loc[idx]
        example = format_data(row['raw_address'], row['POI'], row['street'])
        examples.append(example)
    except Exception as e:
        print(idx)
        print("-" * 50)
        print(e)
        break


Preparing Spacy examples...


In [10]:
print(len(examples))
df_test = pd.read_csv("./Address Elements Extraction Dataset/test.csv")
df_test.set_index("id", inplace=True)

300000


In [11]:
def saveTemp(i, test_df, ner_nlp):
    print("==================   saveTemp")
    submission = []
    for idx, row in test_df.iloc[:].iterrows():
        doc = ner_nlp(row['raw_address'])
        tmp = {'id': idx}
        for ent in doc.ents:
            tmp[ent.label_] = ent.text
        submission.append(tmp)
    submission = pd.DataFrame(submission)
    submission = submission.fillna("")
    combine_lambda = lambda x: '{}/{}'.format(x['POI'], x['STREET'])
    submission["POI/street"] = submission.apply(combine_lambda, axis = 1)
    
    pd.DataFrame({'id':submission['id'],
                  'POI/street':submission['POI/street']}).to_csv('./submission_all_{}.csv'.format(i), header=True, index=False)

In [19]:
def train_spacy(nlp, examples, iterations):
    TRAIN_DATA = examples
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    min_losses = np.inf
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            
            random.shuffle(examples)
            losses = {}
#             for example in examples:
#                 nlp.update(
#                     [example],
#                     drop=0.2,  # dropout - make it harder to memorise data
#                     sgd=optimizer,  # callable to update weights
#                     losses=losses)

            for batch in spacy.util.minibatch(examples, size=5):
                nlp.update(batch,
                            drop = 0.2,  # dropout - make it harder to memorise data
                            sgd = optimizer,  # callable to update weights
                            losses = losses)
            print(losses,itn%20 == 0)
            if losses['ner'] < min_losses:
                min_losses = losses['ner']
                if itn%20 == 0 and itn != 0 :
                    nlp.to_disk("./street_pipeline")
                    saveTemp(itn, df_test, nlp)
                
    return nlp


300000


In [20]:
spacy.require_gpu()
print(len(examples))

300000


In [21]:
ner_nlp = train_spacy(nlp, examples[:5000], 25)

Starting iteration 0
{'ner': 6250.291559555151} True
Starting iteration 1
{'ner': 4514.416197725128} False
Starting iteration 2
{'ner': 3728.4837143832306} False
Starting iteration 3
{'ner': 3141.779219829178} False
Starting iteration 4
{'ner': 2617.1965455801746} False
Starting iteration 5
{'ner': 2359.660586907646} False
Starting iteration 6
{'ner': 2054.9478107195196} False
Starting iteration 7
{'ner': 1866.8040904196143} False
Starting iteration 8
{'ner': 1579.3953989492352} False
Starting iteration 9
{'ner': 1487.9122692405915} False
Starting iteration 10
{'ner': 1390.5660515148347} False
Starting iteration 11
{'ner': 1257.9426918019456} False
Starting iteration 12
{'ner': 1175.8923320120423} False
Starting iteration 13
{'ner': 1145.9400623274498} False
Starting iteration 14
{'ner': 1078.4527189640362} False
Starting iteration 15
{'ner': 942.2840581953725} False
Starting iteration 16
{'ner': 939.6170621359341} False
Starting iteration 17
{'ner': 845.7042424177283} False
Starting i

In [27]:
ner_nlp = train_spacy(ner_nlp, examples[:10000], 25)

Starting iteration 0
{'ner': 11311.390522945101} True
Starting iteration 1
{'ner': 7955.781866206249} False
Starting iteration 2
{'ner': 6568.394899257277} False
Starting iteration 3
{'ner': 5815.826171497893} False
Starting iteration 4
{'ner': 5065.116900858079} False
Starting iteration 5
{'ner': 4641.345061616835} False
Starting iteration 6
{'ner': 4130.603815155804} False
Starting iteration 7
{'ner': 3740.7190968296577} False
Starting iteration 8
{'ner': 3515.7429153719613} False
Starting iteration 9
{'ner': 3128.314397619845} False
Starting iteration 10
{'ner': 3074.436420949807} False
Starting iteration 11
{'ner': 2787.7388601123766} False
Starting iteration 12
{'ner': 2718.793744782713} False
Starting iteration 13
{'ner': 2516.7036806742685} False
Starting iteration 14
{'ner': 2402.640504459179} False
Starting iteration 15
{'ner': 2213.78560536515} False
Starting iteration 16
{'ner': 2186.213508641598} False
Starting iteration 17
{'ner': 2089.1769942182777} False
Starting iterati

In [28]:
for idx, row in df.iloc[28000:28010].iterrows():
    print(f"address: {row['raw_address']}")
    print(f"expected poi: {row['POI']}")
    print(f"expected street: {row['street']}")
    print()
    
    doc = ner_nlp(row['raw_address'])
    for ent in doc.ents:
        print(ent.text, "-", ent.label_)

    print("-" * 50)

address: orahili badalu fukagambo 22862
expected poi: nan
expected street: fukagambo

orahili badalu - POI
--------------------------------------------------
address: mand, raya gilim gilimanuk
expected poi: mandapin
expected street: raya gilim

raya gilim - STREET
--------------------------------------------------
address: raya indu, no 62 indo kimia, cikarang selatan
expected poi: indojaya kimia
expected street: raya indu

raya indu - STREET
--------------------------------------------------
address: pegad jend besar ah nasu,
expected poi: nan
expected street: nan

pegad - POI
jend besar ah nasu - STREET
--------------------------------------------------
address: kar jawa pelai pelaihari
expected poi: nan
expected street: kar jawa pelai

kar jawa pelai - STREET
--------------------------------------------------
address: tb. mekar maju, surya kenc selabatu cikole
expected poi: tb. mekar maju
expected street: surya kenc

surya kenc - STREET
---------------------------------------------

In [29]:
submission = []
for idx, row in df_test.iloc[:].iterrows():
    doc = ner_nlp(row['raw_address'])
    tmp = {'id': idx}
    for ent in doc.ents:
        tmp[ent.label_] = ent.text
    submission.append(tmp)
pd.DataFrame(submission)

Unnamed: 0,id,STREET,POI
0,0,s. par,
1,1,angg per,
2,2,mand imog,
3,3,raya nga sri,ud agung rej
4,4,cut mutia,
...,...,...,...
49995,49995,,toko mbak farid
49996,49996,vete 3 cari,tk. ridho kids
49997,49997,nasio,
49998,49998,jl. mujair raya,graha indah


In [38]:
submission = pd.DataFrame(submission)
# submission['POI/street'] = submission['POI'] + '/' + submission['STREET']
submission = submission.fillna("")
combine_lambda = lambda x: '{}/{}'.format(x['POI'], x['STREET'])
submission["POI/street"] = submission.apply(combine_lambda, axis = 1)

In [39]:
submission.head()

Unnamed: 0,id,STREET,POI,POI/street
0,0,s. par,,/s. par
1,1,angg per,,/angg per
2,2,mand imog,,/mand imog
3,3,raya nga sri,ud agung rej,ud agung rej/raya nga sri
4,4,cut mutia,,/cut mutia


In [40]:
pd.DataFrame({'id':submission['id'],
              'POI/street':submission['POI/street']}).to_csv('./submission_test.csv', header=True, index=False)