In [2]:
import spacy
import pickle
import random
import warnings

from tqdm import tqdm
from spacy.util import minibatch, compounding
from spacy.training import Example
from pathlib import Path

In [3]:
with open('pickle/spacy_train.pickle','rb') as file:
    train_data = pickle.load(file)

In [4]:
train_data

[('jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat',
  {'entities': [(0, 40, 'STREET')]}),
 ('aye, jati sampurna', {'entities': []}),
 ('setu siung 119 rt 5 1 13880 cipayung', {'entities': [(5, 10, 'STREET')]}),
 ('toko dita, kertosono', {'entities': [(0, 9, 'POI')]}),
 ('jl. orde baru', {'entities': [(0, 13, 'STREET')]}),
 ('raya samb gede, 299 toko bb kids',
  {'entities': [(20, 32, 'POI'), (0, 14, 'STREET')]}),
 ('kem mel raya, no 4 bojong rawalumbu rt 1 36 rawalumbu',
  {'entities': [(0, 12, 'STREET')]}),
 ('tela keuramat kuta alam', {'entities': [(0, 4, 'STREET')]}),
 ('gg. i wates magersari', {'entities': [(0, 5, 'STREET')]}),
 ('bunga ncole ix 2', {'entities': [(0, 14, 'STREET')]}),
 ('cikahuripan sd neg boj 02 klap boj, no 5 16877',
  {'entities': [(26, 34, 'STREET')]}),
 ('yaya atohar,', {'entities': []}),
 ('abim ix 24 5', {'entities': [(0, 7, 'STREET')]}),
 ('gang xiii rungkut', {'entities': [(0, 9, 'STREET')]}),
 ('kamp utan jaya, 23 rt 3 rw 8 16433 beji'

In [3]:
with open('pickle/spacy_train_final.pickle','rb') as file:
    train_data_final = pickle.load(file)

In [4]:
train_data_final

[('jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat',
  {'entities': [(0, 40, 'STREET')]}),
 ('aye, jati sampurna', {'entities': []}),
 ('setu siung 119 rt 5 1 13880 cipayung', {'entities': [(5, 10, 'STREET')]}),
 ('toko dita, kertosono', {'entities': [(0, 9, 'POI')]}),
 ('jl. orde baru', {'entities': [(0, 13, 'STREET')]}),
 ('raya samb gede, 299 toko bb kids',
  {'entities': [(20, 32, 'POI'), (0, 14, 'STREET')]}),
 ('kem mel raya, no 4 bojong rawalumbu rt 1 36 rawalumbu',
  {'entities': [(0, 12, 'STREET')]}),
 ('tela keuramat kuta alam', {'entities': [(0, 4, 'STREET')]}),
 ('gg. i wates magersari', {'entities': [(0, 5, 'STREET')]}),
 ('bunga ncole ix 2', {'entities': [(0, 14, 'STREET')]}),
 ('cikahuripan sd neg boj 02 klap boj, no 5 16877',
  {'entities': [(26, 34, 'STREET')]}),
 ('yaya atohar,', {'entities': []}),
 ('abim ix 24 5', {'entities': [(0, 7, 'STREET')]}),
 ('gang xiii rungkut', {'entities': [(0, 9, 'STREET')]}),
 ('kamp utan jaya, 23 rt 3 rw 8 16433 beji'

## Model 1 ( with 'xx_ent_wiki_sm' ) multi-language model

In [9]:
nlp = spacy.load('xx_ent_wiki_sm')

In [14]:
tokens = nlp("I work at Google, Facebook and Twitter")

In [15]:
for ent in tokens.ents:
    print(ent.text)

Google
Facebook
Twitter


In [61]:
nlp.pipe_names

['ner']

In [62]:
ner = nlp.get_pipe('ner')

In [63]:
# add POI and street label to ner
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [74]:
optimizer = nlp.resume_training()
examples = []
for text, annots in tqdm(train_data):
    try:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))
        warnings.filterwarnings("ignore")
    except:
        continue
    
nlp.initialize(lambda: examples)    
for i in tqdm(range(50)):
    random.shuffle(examples)
    losses = {}

    for batch in minibatch(examples, size=128):
        nlp.update(batch, losses = losses, sgd = optimizer)
        #print('Losses: ', losses)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [00:34<00:00, 8774.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 50/50 [10:33:24<00:00, 760.08s/it]


In [83]:
tokens = nlp('banjarn scoo club, letjen supr, kutabanjarnegara banjarnegara')
for ent in tokens.ents:
    print(ent.text, ent.label_)

letjen supr STREET


In [76]:
nlp.to_disk('./id_ner_address')

## Model 2 ( with 'en_core_web_lg' ) large english model

In [20]:
nlp2 = spacy.load('en_core_web_lg')

In [21]:
tokens2 = nlp2("I work at Google, Facebook and Twitter")

In [22]:
for ent in tokens2.ents:
    print(ent.text)

Google
Facebook
Twitter


In [23]:
nlp2.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [24]:
ner2 = nlp2.get_pipe('ner')

In [25]:
# add POI and street label to ner2
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner2.add_label(ent[2])

In [26]:
disable_pipes = [pipe for pipe in nlp2.pipe_names if pipe != 'ner']

In [29]:
with nlp2.disable_pipes(*disable_pipes):
    optimizer = nlp2.resume_training()
    examples = []
    for text, annots in tqdm(train_data):
        try:
            examples.append(Example.from_dict(nlp2.make_doc(text), annots))
            warnings.filterwarnings("ignore")
        except:
            continue

    #nlp2.initialize(lambda: examples)    
    for i in tqdm(range(50)):
        random.shuffle(examples)
        losses = {}

        for batch in minibatch(examples, size=128):
            nlp2.update(batch, losses = losses, sgd = optimizer)
            #print('Losses: ', losses)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [00:36<00:00, 8194.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [7:34:37<00:00, 545.55s/it]


In [30]:
tokens = nlp2('banjarn scoo club, letjen supr, kutabanjarnegara banjarnegara')
for ent in tokens.ents:
    print(ent.text, ent.label_)

letjen supr STREET


In [31]:
nlp2.to_disk('./id_ner_address_model_2_from_en')

## Model 3 ( with 'xx_ent_wiki_sm' ) but without nlp.initialize()

In [32]:
nlp3 = spacy.load('xx_ent_wiki_sm')

In [33]:
tokens3 = nlp3("I work at Google, Facebook and Twitter")

In [34]:
for ent in tokens3.ents:
    print(ent.text)

Google
Facebook
Twitter


In [35]:
nlp3.pipe_names

['ner']

In [36]:
ner3 = nlp3.get_pipe('ner')

In [37]:
# add POI and street label to ner
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner3.add_label(ent[2])

In [38]:
optimizer = nlp3.resume_training()
examples = []
for text, annots in tqdm(train_data):
    try:
        examples.append(Example.from_dict(nlp3.make_doc(text), annots))
        warnings.filterwarnings("ignore")
    except:
        continue
    
#nlp.initialize(lambda: examples)    
for i in tqdm(range(50)):
    random.shuffle(examples)
    losses = {}

    for batch in minibatch(examples, size=128):
        nlp3.update(batch, losses = losses, sgd = optimizer)
        #print('Losses: ', losses)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [00:45<00:00, 6633.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [7:44:30<00:00, 557.41s/it]


In [39]:
tokens = nlp3('banjarn scoo club, letjen supr, kutabanjarnegara banjarnegara')
for ent in tokens.ents:
    print(ent.text, ent.label_)

letjen supr STREET


In [40]:
nlp3.to_disk('./id_ner_address_model_3_multi_no_init')

## Final Model ( with 'xx_ent_wiki_sm' ) multi-language model 
Same structure with model 1 cuz model 1 has the best performance

In [5]:
nlp4 = spacy.load('xx_ent_wiki_sm')

In [6]:
tokens = nlp4("I work at Google, Facebook and Twitter")

In [7]:
for ent in tokens.ents:
    print(ent.text)

Google
Facebook
Twitter


In [8]:
nlp4.pipe_names

['ner']

In [9]:
ner = nlp4.get_pipe('ner')

In [10]:
# add POI and street label to ner
for _, annotations in train_data_final:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [11]:
optimizer = nlp4.resume_training()
examples = []
for text, annots in tqdm(train_data_final):
    try:
        examples.append(Example.from_dict(nlp4.make_doc(text), annots))
        warnings.filterwarnings("ignore")
    except:
        continue
    
nlp4.initialize(lambda: examples)    
for i in tqdm(range(120)):
    random.shuffle(examples)
    losses = {}

    for batch in minibatch(examples, size=128):
        nlp4.update(batch, losses = losses, sgd = optimizer)
        #print('Losses: ', losses)
    
    if i+1 >= 40 and i+1 % 20 == 0:
        nlp4.to_disk(f'./id_ner_address_final_{i+1}')

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [00:46<00:00, 6485.27it/s]
100%|████████████████████████████████████████████████████████████████████████████| 120/120 [16:51:22<00:00, 505.69s/it]


In [12]:
tokens = nlp4('banjarn scoo club, letjen supr, kutabanjarnegara banjarnegara')
for ent in tokens.ents:
    print(ent.text, ent.label_)

letjen supr STREET


In [13]:
nlp4.to_disk('./id_ner_address_final')

## Continue training for final model with different batch size
(end up with worse result than the previous model)

In [14]:
nlp5 = spacy.load('id_ner_address_final')

In [15]:
tokens = nlp5("I work at Google, Facebook and Twitter")

In [16]:
for ent in tokens.ents:
    print(ent.text)

I work at Google


In [19]:
nlp5.pipe_names

['ner']

In [20]:
ner = nlp5.get_pipe('ner')

In [21]:
# add POI and street label to ner
for _, annotations in train_data_final:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [23]:
optimizer = nlp5.resume_training()
examples = []
for text, annots in tqdm(train_data_final):
    try:
        examples.append(Example.from_dict(nlp5.make_doc(text), annots))
        warnings.filterwarnings("ignore")
    except:
        continue
    
nlp5.initialize(lambda: examples)    
for i in tqdm(range(25)):
    random.shuffle(examples)
    losses = {}

    for batch in minibatch(examples, size=32):
        nlp5.update(batch, losses = losses, sgd = optimizer)
        #print('Losses: ', losses)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [00:39<00:00, 7594.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 25/25 [7:50:29<00:00, 1129.17s/it]


In [24]:
tokens = nlp5('banjarn scoo club, letjen supr, kutabanjarnegara banjarnegara')
for ent in tokens.ents:
    print(ent.text, ent.label_)

letjen supr STREET


In [25]:
nlp5.to_disk('./id_ner_address_final_2')