In [14]:
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm

In [15]:
# Read the raw train/test CSVs once; later cells work on copies of these frames.
raw_df = pd.read_csv('data/train.csv')
raw_test = pd.read_csv('data/test.csv')

In [4]:
# Work on copies so the frames read from disk stay untouched.
df, test_df = raw_df.copy(), raw_test.copy()

## Model 1 (with 'xx_ent_wiki_sm'): multi-language model

In [30]:
# Preview the test set (pandas truncates the rendered rows).
test_df

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...


In [9]:
# Load the spaCy pipeline named 'id_ner_address'; it produces the Model 1 predictions below.
nlp = spacy.load('id_ner_address')

In [32]:
# Start from an empty list of 'POI/street' strings for Model 1.
prediction = list()

In [33]:
# Run the Model 1 pipeline over every raw address and build one 'POI/street'
# string per row. `prediction` is rebuilt from scratch in this cell, so
# re-running it cannot append a second batch onto a stale list and break the
# later `test_df['prediction'] = prediction` assignment.
prediction = []
# nlp.pipe batches the texts and yields Docs in input order; the address column
# is selected by name instead of by tuple position.
for doc in tqdm(nlp.pipe(test_df['raw_address']), total = len(test_df)):
    poi_parts = [ent.text for ent in doc.ents if ent.label_ == 'POI']
    street_parts = [ent.text for ent in doc.ents if ent.label_ == 'STREET']
    # NOTE(review): as before, several spans with the same label are glued
    # together without a separator, and POI spans end up in reverse order
    # (each one used to be prepended). Confirm this is the intended format.
    prediction.append(''.join(reversed(poi_parts)) + '/' + ''.join(street_parts))

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [02:06<00:00, 396.53it/s]


In [34]:
# Attach the predictions as a new column; `prediction` must hold exactly one
# entry per row of test_df, in row order.
test_df['prediction'] = prediction

In [35]:
# Inspect the first 100 rows together with their predictions.
test_df.iloc[:100]

Unnamed: 0,id,raw_address,prediction
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
95,95,20 dese 15 rt 2 3 pegadungan kalideres,/20 dese
96,96,nila 64 bulakamba,/nila 64
97,97,"raya pasar kec,",/raya pasar kec
98,98,"lin ren tembil smk pekantua,",smk pekantua/lin ren tembil


In [36]:
# Keep only the two columns that go into the submission file.
submission = test_df.loc[:, ['id', 'prediction']]

In [39]:
# Rename the prediction column to the header used in the submission file.
submission = submission.rename(columns={'prediction': 'POI/street'})

In [42]:
# Write the Model 1 submission; index = False leaves the DataFrame index out of the file.
submission.to_csv('submission1.csv', index = False)

## Model 2 (with 'en_core_web_lg'): large English model

In [5]:
# Fresh deep copy of the raw test set for the Model 2 run.
test_df2 = raw_test.copy(deep=True)

In [6]:
# Load the spaCy pipeline named 'id_ner_address_model_2_from_en'; it produces the Model 2 predictions below.
nlp2 = spacy.load('id_ner_address_model_2_from_en')

In [7]:
# Start from an empty list of 'POI/street' strings for Model 2.
prediction = list()

In [8]:
# Run the Model 2 pipeline over every raw address and build one 'POI/street'
# string per row. `prediction` is rebuilt from scratch in this cell, so
# re-running it cannot append a second batch onto a stale list and break the
# later `test_df2['prediction'] = prediction` assignment.
prediction = []
# nlp2.pipe batches the texts and yields Docs in input order; the address
# column is selected by name instead of by tuple position.
for doc in tqdm(nlp2.pipe(test_df2['raw_address']), total = len(test_df2)):
    poi_parts = [ent.text for ent in doc.ents if ent.label_ == 'POI']
    street_parts = [ent.text for ent in doc.ents if ent.label_ == 'STREET']
    # NOTE(review): as before, several spans with the same label are glued
    # together without a separator, and POI spans end up in reverse order
    # (each one used to be prepended). Confirm this is the intended format.
    prediction.append(''.join(reversed(poi_parts)) + '/' + ''.join(street_parts))

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [05:22<00:00, 154.94it/s]


In [9]:
# Attach the predictions as a new column; `prediction` must hold exactly one
# entry per row of test_df2, in row order.
test_df2['prediction'] = prediction

In [10]:
# Inspect the first 100 rows together with their predictions.
test_df2.iloc[:100]

Unnamed: 0,id,raw_address,prediction
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
95,95,20 dese 15 rt 2 3 pegadungan kalideres,/20 dese
96,96,nila 64 bulakamba,/nila
97,97,"raya pasar kec,",/raya pasar kec
98,98,"lin ren tembil smk pekantua,",/


In [11]:
# Keep only the two columns that go into the submission file.
submission2 = test_df2.loc[:, ['id', 'prediction']]

In [12]:
# Rename the prediction column to the header used in the submission file.
submission2 = submission2.rename(columns={'prediction': 'POI/street'})

In [13]:
# Write the Model 2 submission; index = False leaves the DataFrame index out of the file.
submission2.to_csv('submission2.csv', index = False)

## Model 3 (with 'xx_ent_wiki_sm'): multi-language model without nlp.initialize()

In [17]:
# Fresh deep copy of the raw test set for the Model 3 run.
test_df3 = raw_test.copy(deep=True)

In [14]:
# Load the spaCy pipeline named 'id_ner_address_model_3_multi_no_init'; it produces the Model 3 predictions below.
nlp3 = spacy.load('id_ner_address_model_3_multi_no_init')

In [15]:
# Start from an empty list of 'POI/street' strings for Model 3.
prediction = list()

In [18]:
# Run the Model 3 pipeline over every raw address and build one 'POI/street'
# string per row. `prediction` is rebuilt from scratch in this cell, so
# re-running it cannot append a second batch onto a stale list and break the
# later `test_df3['prediction'] = prediction` assignment.
prediction = []
# nlp3.pipe batches the texts and yields Docs in input order; the address
# column is selected by name instead of by tuple position.
for doc in tqdm(nlp3.pipe(test_df3['raw_address']), total = len(test_df3)):
    poi_parts = [ent.text for ent in doc.ents if ent.label_ == 'POI']
    street_parts = [ent.text for ent in doc.ents if ent.label_ == 'STREET']
    # NOTE(review): as before, several spans with the same label are glued
    # together without a separator, and POI spans end up in reverse order
    # (each one used to be prepended). Confirm this is the intended format.
    prediction.append(''.join(reversed(poi_parts)) + '/' + ''.join(street_parts))

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [02:15<00:00, 369.06it/s]


In [19]:
# Attach the predictions as a new column; `prediction` must hold exactly one
# entry per row of test_df3, in row order.
test_df3['prediction'] = prediction

In [20]:
# Inspect the first 100 rows together with their predictions.
test_df3.iloc[:100]

Unnamed: 0,id,raw_address,prediction
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
95,95,20 dese 15 rt 2 3 pegadungan kalideres,/20 dese
96,96,nila 64 bulakamba,/nila
97,97,"raya pasar kec,",/raya pasar kec
98,98,"lin ren tembil smk pekantua,",smk pekantua/lin ren tembil


In [21]:
# Keep only the two columns that go into the submission file.
submission3 = test_df3.loc[:, ['id', 'prediction']]

In [22]:
# Rename the prediction column to the header used in the submission file.
submission3 = submission3.rename(columns={'prediction': 'POI/street'})

In [23]:
# Write the Model 3 submission; index = False leaves the DataFrame index out of the file.
submission3.to_csv('submission3.csv', index = False)

## Results from 3 models
Model 1 (xx_ent_wiki_sm model with nlp.initialize()) **Test accuracy: 58.60%**  
Model 2 (en_core_web_lg model) **Test accuracy: 51.22%**  
Model 3 (xx_ent_wiki_sm model without nlp.initialize()) **Test accuracy: 57.90%**

### Model 1 performs best, so we will use it as the final model and train it for longer

## Final Model (with 'xx_ent_wiki_sm'): multi-language model with nlp.initialize() and 120 iterations

In [5]:
# Fresh deep copy of the raw test set for the final-model run.
test_df4 = raw_test.copy(deep=True)

In [6]:
# Load the spaCy pipeline named 'id_ner_address_final'; it produces the final-model predictions below.
nlp4 = spacy.load('id_ner_address_final')

In [7]:
# Start from an empty list of 'POI/street' strings for the final model.
prediction = list()

In [8]:
# Run the final-model pipeline over every raw address and build one
# 'POI/street' string per row. `prediction` is rebuilt from scratch in this
# cell, so re-running it cannot append a second batch onto a stale list and
# break the later `test_df4['prediction'] = prediction` assignment.
prediction = []
# nlp4.pipe batches the texts and yields Docs in input order; the address
# column is selected by name instead of by tuple position.
for doc in tqdm(nlp4.pipe(test_df4['raw_address']), total = len(test_df4)):
    poi_parts = [ent.text for ent in doc.ents if ent.label_ == 'POI']
    street_parts = [ent.text for ent in doc.ents if ent.label_ == 'STREET']
    # NOTE(review): as before, several spans with the same label are glued
    # together without a separator, and POI spans end up in reverse order
    # (each one used to be prepended). Confirm this is the intended format.
    prediction.append(''.join(reversed(poi_parts)) + '/' + ''.join(street_parts))

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [02:17<00:00, 363.58it/s]


In [9]:
# Attach the predictions as a new column; `prediction` must hold exactly one
# entry per row of test_df4, in row order.
test_df4['prediction'] = prediction

In [10]:
# Inspect the first 100 rows together with their predictions.
test_df4.iloc[:100]

Unnamed: 0,id,raw_address,prediction
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
95,95,20 dese 15 rt 2 3 pegadungan kalideres,/20 dese
96,96,nila 64 bulakamba,/nila
97,97,"raya pasar kec,",/raya pasar kec
98,98,"lin ren tembil smk pekantua,",smk pekantua/lin ren tembil


In [11]:
# Keep only the two columns that go into the submission file.
submission4 = test_df4.loc[:, ['id', 'prediction']]

In [12]:
# Rename the prediction column to the header used in the submission file.
submission4 = submission4.rename(columns={'prediction': 'POI/street'})

In [13]:
# Write the final-model submission; index = False leaves the DataFrame index out of the file.
submission4.to_csv('submission4.csv', index = False)

## Final Model (with 'xx_ent_wiki_sm'): multi-language model with nlp.initialize() and 120 iterations (+25 iterations)

In [16]:
# Fresh deep copy of the raw test set for the extended final-model run.
test_df5 = raw_test.copy(deep=True)

In [17]:
# Load the spaCy pipeline named 'id_ner_address_final_2'; it produces the predictions for this section.
nlp5 = spacy.load('id_ner_address_final_2')

In [18]:
# Start from an empty list of 'POI/street' strings for this model.
prediction = list()

In [19]:
# Run the 'id_ner_address_final_2' pipeline over every raw address and build
# one 'POI/street' string per row. `prediction` is rebuilt from scratch in this
# cell, so re-running it cannot append a second batch onto a stale list and
# break the later `test_df5['prediction'] = prediction` assignment.
prediction = []
# nlp5.pipe batches the texts and yields Docs in input order; the address
# column is selected by name instead of by tuple position.
for doc in tqdm(nlp5.pipe(test_df5['raw_address']), total = len(test_df5)):
    poi_parts = [ent.text for ent in doc.ents if ent.label_ == 'POI']
    street_parts = [ent.text for ent in doc.ents if ent.label_ == 'STREET']
    # NOTE(review): as before, several spans with the same label are glued
    # together without a separator, and POI spans end up in reverse order
    # (each one used to be prepended). Confirm this is the intended format.
    prediction.append(''.join(reversed(poi_parts)) + '/' + ''.join(street_parts))

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [02:03<00:00, 404.11it/s]


In [20]:
# Attach the predictions as a new column; `prediction` must hold exactly one
# entry per row of test_df5, in row order.
test_df5['prediction'] = prediction

In [21]:
# Inspect the first 100 rows together with their predictions.
test_df5.iloc[:100]

Unnamed: 0,id,raw_address,prediction
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
95,95,20 dese 15 rt 2 3 pegadungan kalideres,/20 dese
96,96,nila 64 bulakamba,/nila
97,97,"raya pasar kec,",/raya pasar kec
98,98,"lin ren tembil smk pekantua,",smk pekantua/lin ren tembil


In [22]:
# Keep only the two columns that go into the submission file.
submission5 = test_df5.loc[:, ['id', 'prediction']]

In [23]:
# Rename the prediction column to the header used in the submission file.
submission5 = submission5.rename(columns={'prediction': 'POI/street'})

In [24]:
# Write this model's submission; index = False leaves the DataFrame index out of the file.
submission5.to_csv('submission5.csv', index = False)