## Project: Named Entity Recognition model <br>
Version: 0.1 <br>
Author: Lakshitha Wisumperuma

### Requirments <br>
Install the dependancies using requirements.txt <br>
Important: 
1. Spacy should be version 2.3.5

In [None]:
!pip3 install -r requirements.txt

### Load Packages

In [1]:
from __future__ import unicode_literals, print_function
import pickle
import random
from pathlib import Path
import plac
import spacy
from spacy.util import minibatch, compounding

print(spacy.__version__)

2.3.5


In [11]:
import wandb
wandb.init(project='spacy-ner')

[34m[1mwandb[0m: wandb version 0.10.23 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Following is optional, train using GPU <br>
Requirments: <br>
    1. GPU install <br>
    2. CUDA installed
    

In [3]:
!nvcc --version
spacy.require_gpu()

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


True

### Entity labels

In [4]:
LABEL = ['I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O', 'I-per', 'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org', 'I-gpe']

"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""

'\ngeo = Geographical Entity\norg = Organization\nper = Person\ngpe = Geopolitical Entity\ntim = Time indicator\nart = Artifact\neve = Event\nnat = Natural Phenomenon\n'

### Loading training data 

In [5]:
with open ('spacy_dataset', 'rb') as fp:
    NER_TRAIN_TEST_DATASET = pickle.load(fp)

TRAIN_DATA = NER_TRAIN_TEST_DATASET[:38300]
TEST_DATA = NER_TRAIN_TEST_DATASET[38300:]

#Dataset length 47761
#train 38300
#train 9461


### Model training fuction

In [8]:
def train_model(model, new_model_name, output_dir, n_iter):
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print("epoch : ", itn, "    losses : ", losses)
            wandb.log({'epoch': itn, 'loss': losses})

    wandb.save("model-log.h5")
    wandb.finish()
    
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        

### Traning the Model

In [10]:
train_model(None, 'new_model_v2', 'final/', 2)

Created blank 'en' model


  proc.begin_training(
  proc.begin_training(


epoch :  0     losses :  {'ner': 70282.28596113896}


Error: You must call wandb.init() before wandb.log()

### Test the Model

In [11]:
test_text = 'Lucky is from Sri Lanka'
output_dir = 'final/'

print("Loading from", output_dir)

nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
for ent in doc2.ents:
    print(ent.label_, ent.text)

Loading from final/
B-per Lucky
B-per Sri
I-geo Lanka


### Evaluating the model

In [4]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

output_dir = 'output/'
nlp2 = spacy.load(output_dir)

def evaluate(model, examples):
  scorer = Scorer()
  for input_, annot in examples:
    #print(input_)
    doc_gold_text = model.make_doc(input_)
    gold = GoldParse(doc_gold_text, entities=annot['entities'])
    pred_value = model(input_)
    scorer.score(pred_value, gold)
  return scorer.scores

test_result = evaluate(nlp2, TEST_DATA)
print(test_result)

{'uas': 0.0, 'las': 0.0, 'las_per_type': {'compound': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nsubj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'prep': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'det': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'amod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'root': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'mark': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'advmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'advcl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'dobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'aux': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'xcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'prt': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'cc': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'conj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'poss': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nummod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'ccomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'acl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'quantmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'npadvmod': {'p': 0.0, 'r':

Refer: https://spacy.io/usage/training#tips