## Project: Named Entity Recognition model <br>
Version: 0.1 <br>
Author: Lakshitha Wisumperuma

### Requirements <br>
Install the dependencies using requirements.txt <br>
Important: 
1. spaCy should be version 2.3.5

In [None]:
!pip3 install -r requirements.txt

### Load Packages

In [1]:
from __future__ import unicode_literals, print_function
import pickle
import random
from pathlib import Path
import plac
import spacy
from spacy.util import minibatch, compounding

# Show the installed spaCy version so it can be compared with the 2.3.5 requirement.
print(spacy.__version__)

2.3.5


The following cell is optional: run it to train on a GPU. <br>
Requirements: <br>
    1. GPU installed <br>
    2. CUDA installed
    

In [2]:
!nvcc --version
spacy.require_gpu()

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


True

### Entity labels

In [3]:
# Tags registered on the NER component by train_model(); the order is kept
# exactly as originally written.
# NOTE(review): these are BIO-prefixed token tags (plus 'O') used directly as
# entity labels; the sample output further down prints 'B-geo Sri' and
# 'I-geo Lanka' as two separate entities — confirm this is intended.
LABEL = [
    'I-geo',
    'B-geo',
    'I-art',
    'B-art',
    'B-tim',
    'B-nat',
    'B-eve',
    'O',
    'I-per',
    'I-tim',
    'I-nat',
    'I-eve',
    'B-per',
    'I-org',
    'B-gpe',
    'B-org',
    'I-gpe',
]

# Legend for the tag suffixes (left as a bare string so the cell still displays it).
"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""

'\ngeo = Geographical Entity\norg = Organization\nper = Person\ngpe = Geopolitical Entity\ntim = Time indicator\nart = Artifact\neve = Event\nnat = Natural Phenomenon\n'

### Loading training data 

In [4]:
# Deserialize the prepared training examples from the local 'spacy_dataset' file.
# NOTE: unpickling can execute arbitrary code — only load a dataset file you trust.
with Path('spacy_dataset').open('rb') as dataset_file:
    TRAIN_DATA = pickle.load(dataset_file)

### Model training function

In [5]:
def train_model(model, new_model_name, output_dir, n_iter, train_data=None, drop=0.35):
    """Train (or continue training) the spaCy 'ner' component and optionally save it.

    Args:
        model: Name or path handed to ``spacy.load``. ``None`` starts from a
            blank 'en' pipeline instead.
        new_model_name: Value written to ``nlp.meta['name']`` before saving.
        output_dir: Directory the pipeline is written to; ``None`` skips saving.
            Missing parent directories are created.
        n_iter: Number of passes over the training examples.
        train_data: Sequence of ``(text, annotations)`` pairs. Defaults to the
            notebook-level ``TRAIN_DATA``.
        drop: Dropout rate passed to ``nlp.update``.

    Returns:
        None. Progress and per-iteration losses are printed.
    """
    if train_data is None:
        train_data = TRAIN_DATA
    # Shuffle a private copy each iteration so the caller's list keeps its order.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")

    # Reuse an existing 'ner' component, otherwise create and append one.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)

    for label in LABEL:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Use the `ner` handle obtained above instead of going through `nlp.entity`.
        # NOTE(review): if the loaded pipeline had no 'ner' component, the one
        # added above is presumably still uninitialised here — confirm before
        # relying on that path.
        optimizer = ner.create_optimizer()

    # Only the NER component is updated; every other pipe is disabled meanwhile.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            batches = minibatch(examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=drop,
                           losses=losses)
            print('Losses', losses)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested targets work and an existing directory is reused.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        

### Training the Model

In [6]:
# Continue training from the small English pipeline for 100 passes and save to 'final/'.
train_model(
    model="en_core_web_sm",
    new_model_name='new_model_v2',
    output_dir='final/',
    n_iter=100,
)

Loaded model 'en_core_web_sm'
Losses {'ner': 542846.9937061241}
Losses {'ner': 531319.1385076634}
Losses {'ner': 527744.917006569}
Losses {'ner': 524900.4176091421}
Losses {'ner': 525904.7823297309}
Losses {'ner': 524448.7311715771}
Losses {'ner': 523816.8188313992}
Losses {'ner': 523186.12224400166}
Losses {'ner': 522327.68912530446}
Losses {'ner': 521809.3583123947}
Losses {'ner': 524359.8282843948}
Losses {'ner': 522506.66997830383}
Losses {'ner': 523422.0864084009}
Losses {'ner': 521380.46632373077}
Losses {'ner': 519699.3646395472}
Losses {'ner': 520189.2869832292}
Losses {'ner': 521134.4424651233}
Losses {'ner': 519937.94597516075}
Losses {'ner': 521019.4313840747}
Losses {'ner': 520372.42350722663}
Losses {'ner': 522172.22954303224}
Losses {'ner': 521409.2417482352}
Losses {'ner': 518247.3741325028}
Losses {'ner': 519127.6138641173}
Losses {'ner': 520724.25288296887}
Losses {'ner': 520943.07210891915}
Losses {'ner': 519780.2917672277}
Losses {'ner': 519182.7035299925}
Losses {'n

### Test the Model

In [12]:
# Reload the saved pipeline from disk and run it on a single sample sentence.
output_dir = 'final/'
test_text = 'Lucky is from Sri Lanka'

print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)

# One line per predicted entity: label first, then the matched text.
for ent in doc2.ents:
    print('%s %s' % (ent.label_, ent.text))

Loading from final/
B-per Lucky
B-geo Sri
I-geo Lanka


Reference: https://spacy.io/usage/training#tips