# Entity training with Doccano and spaCy
18 September 2020

- Train custom NER in spaCy
- Finetune entities in Doccano
- Re-train NER model in spaCy

### Setup

In [1]:
import spacy
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy import displacy
from IPython.display import Image
from IPython.core.display import display, HTML
import random
import json
from sklearn.model_selection import train_test_split
from doccano_transformer.datasets import NERDataset
from doccano_transformer.utils import read_jsonl

In [2]:
spacy.info()

[1m

spaCy version    2.3.2                         
Location         /home/kriesbeck/miniconda3/envs/nlp/lib/python3.7/site-packages/spacy
Platform         Linux-4.19.104-microsoft-standard-x86_64-with-debian-buster-sid
Python version   3.7.7                         
Models                                         



{'spaCy version': '2.3.2',
 'Location': '/home/kriesbeck/miniconda3/envs/nlp/lib/python3.7/site-packages/spacy',
 'Platform': 'Linux-4.19.104-microsoft-standard-x86_64-with-debian-buster-sid',
 'Python version': '3.7.7',
 'Models': ''}

In [3]:
random.seed(0)

In [4]:
def evaluate_model(ner_model, examples):
    """ Evaluates ner_model against goldparse examples

    Args:
        ner_model: spaCy NER model to evaluate
        examples: goldparse text and entity annotations

    Returns:
        Scores, including precision, recall, and f1

    """
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [5]:
def convert_iob_to_spacy(file_path):
    """ Converts data from tab-delimited IOB:
    label \t word \n label \t word \n \n label \t word
    to spaCy format: 
    sentence, {entities : [(start, end, label), (start, end, label)]}

    Args:
        file-path: to IOB data

    Returns:
        Data in spaCy format
        Unique data labels
    """
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[0][2:] # the .txt is formatted: label \t word, label[0:2] = label_type
            label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[1]
            sentence.append(word)
            end += (len(word) + 1) # length of the word + trailing space

            if label_type != 'I' and current_annotation: # if at the end of an annotation
                entities.append((start, end - 2 - len(word), current_annotation)) # append the annotation
                current_annotation = None # reset the annotation
            if label_type == 'B': # if beginning new annotation
                start = end - len(word) - 1 # start annotation at beginning of word
                current_annotation = label # append the word to the current annotation
            if label_type == 'I': # if the annotation is multi-word
                current_annotation = label # append the word

            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)

        # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            sentence = " ".join(sentence)
            training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0
            entities, sentence = [], []
            current_annotation = None
    file.close()
    return training_data, unique_labels

In [29]:
def convert_spacy_to_doccano(data=[], model=None):
    """ Runs NER model on data and converts spaCy formatted output to Doccano JSON format

    Args:
        data
        model: spaCy NER model

    Returns:
        Entity data in Doccano JSON format

    """
    FINETUNE_JSON = []
    for text, _ in data:
        doc = model(text)
        doc_json = doc.to_json()
        labels = []
        for ent in doc_json['ents']:
            label = []
            label.append(ent['start'])
            label.append(ent['end'])
            label.append(ent['label'])
            labels.append(label)
        doc_json['labels'] = labels 
        del doc_json['ents']
        del doc_json['tokens']
        FINETUNE_JSON.append(doc_json)
    return FINETUNE_JSON

In [16]:
def train_ner_model(model=None, train_data=[], n_iter=10):
    """ Trains spaCy NER model

    Args:
        train_data: data
        model: 'en_core_web_sm' or model name to update a pretrained model or None to train a new model
        n_iter: number of iterations

    Returns:
        Trained spaCy NER model
    """
    # Load pretrained spaCy model or create a blank model
    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank("en")

    # Get ner pipeline component (create if necessary)
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # Add new entity labels to entity recognizer
    for i in LABELS:
        ner.add_label(i)

    # Set optimizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    move_names = list(ner.move_names)

    # Get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    # Only train NER pipe
    with nlp.disable_pipes(*other_pipes):
        # Process our training examples in iterations using shuffle, batches, and dropouts
        sizes = compounding(1, 16, 1.001)
        for itn in range(n_iter):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                # For each example, nlp.update steps through the words of the input 
                # At each word, it makes a prediction on the text and checks the annotations 
                # If it was wrong, it adjusts its weights
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            print("Losses", losses)
    return nlp

### Get training data

For this example, we'll use the **MIT Movies** corpus, which contains 10,000 queries about various aspects of movies, with the following entity labels: ACTOR, TITLE, GENRE, DIRECTOR, etc.


The train and test datasets are available here: https://groups.csail.mit.edu/sls/downloads/movie/

They're saved in the data/ directory of this repository, but here are the curl commands for reference:
* curl https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio -o data/test.txt
* curl https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio -o data/train.txt

For other use cases, you could create token or pattern entity matches with spaCy as a starting place.

In [17]:
# Look at the train data:
f = open('data/train.txt', 'r')
train = f.read()
f.close()

print(train[:1000])

O	what
O	movies
O	star
B-ACTOR	bruce
I-ACTOR	willis

O	show
O	me
O	films
O	with
B-ACTOR	drew
I-ACTOR	barrymore
O	from
O	the
B-YEAR	1980s

O	what
O	movies
O	starred
O	both
B-ACTOR	al
I-ACTOR	pacino
O	and
B-ACTOR	robert
I-ACTOR	deniro

O	find
O	me
O	all
O	of
O	the
O	movies
O	that
O	starred
B-ACTOR	harold
I-ACTOR	ramis
O	and
B-ACTOR	bill
I-ACTOR	murray

O	find
O	me
O	a
O	movie
O	with
O	a
O	quote
O	about
O	baseball
O	in
O	it

O	what
O	movies
O	have
B-TITLE	mississippi
O	in
O	the
O	title

O	show
O	me
B-GENRE	science
I-GENRE	fiction
I-GENRE	films
O	directed
O	by
B-DIRECTOR	steven
I-DIRECTOR	spielberg

O	do
O	you
O	have
O	any
B-GENRE	thrillers
O	directed
O	by
B-DIRECTOR	sofia
I-DIRECTOR	coppola

O	what
B-SONG	leonard
I-SONG	cohen
I-SONG	songs
O	have
O	been
O	used
O	in
O	a
O	movie

O	show
O	me
O	films
B-ACTOR	elvis
O	films
B-PLOT	set
I-PLOT	in
I-PLOT	hawaii

O	what
O	movie
O	is
O	references
B-PLOT	zydrate

O	are
O	there
O	any
B-GENRE	musical
I-GENRE	films
O	with
B-ACTOR	patrick
I-ACTOR	dempsey

In [18]:
# Reformat data for spacy
TRAIN_DATA, LABELS = convert_iob_to_spacy("data/train.txt")

In [19]:
TRAIN_DATA[:10]

[['what movies star bruce willis', {'entities': [(17, 29, 'ACTOR')]}],
 ['show me films with drew barrymore from the 1980s',
  {'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]}],
 ['what movies starred both al pacino and robert deniro',
  {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]}],
 ['find me all of the movies that starred harold ramis and bill murray',
  {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]}],
 ['find me a movie with a quote about baseball in it', {'entities': []}],
 ['what movies have mississippi in the title',
  {'entities': [(17, 28, 'TITLE')]}],
 ['show me science fiction films directed by steven spielberg',
  {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]}],
 ['do you have any thrillers directed by sofia coppola',
  {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]}],
 ['what leonard cohen songs have been used in a movie',
  {'entities': [(5, 24, 'SONG')]}],
 ['show me films elvis films set in hawaii',
  {'entities': [(14, 19, 'ACTOR'), (26

### Split off finetune data
Currently, the dataset is 80% train and 20% test.  Split off another 20% (25% of train) to use to finetune in Doccano.

In [20]:
TRAIN_DATA, FINETUNE_DATA = train_test_split(TRAIN_DATA, test_size=0.25, random_state=0)

### Train Model 0 in spaCy
Initial model training based on train data

In [40]:
nlp = train_ner_model(model=None, train_data=TRAIN_DATA, n_iter=10)

  **kwargs


Losses {'ner': 14856.23525674752}
Losses {'ner': 9051.65772494761}
Losses {'ner': 7421.706851945608}
Losses {'ner': 6602.3245470604015}
Losses {'ner': 6156.108316206479}
Losses {'ner': 5671.798042498967}
Losses {'ner': 5403.023727640749}
Losses {'ner': 5053.1297999462695}
Losses {'ner': 4851.588826170277}
Losses {'ner': 4437.469602505349}


In [41]:
# Save model to output directory
nlp.meta["name"] = "movies_0"
nlp.to_disk('models/movies_0')

### Evaluate Model 0 performance

In [42]:
# Load the unseen test data
TEST_DATA, _ = convert_iob_to_spacy("data/test.txt")

In [43]:
# Load Model 0
movies_ner = spacy.load('models/movies_0')

For each entity in the test data, we can calculate the precision, recall, f-score, and more:
* Precision: true positives / (true positives + false positives)  
* Recall: true positives / (true positives + false negatives)  
* F1-score: a measure of accuracy; the harmonic average of precision and recall

In [44]:
scores_model_0 = evaluate_model(movies_ner, TEST_DATA)

In [45]:
print("Precision: {} \nRecall: {} \nF1-score: {}".format(scores_model_0['ents_p'],
                                                         scores_model_0['ents_r'],
                                                         scores_model_0['ents_f']))

Precision: 84.40779610194903 
Recall: 84.36036710994568 
F1-score: 84.38407494145198


### Run Model 0 on finetune data
Important: Run tuning data through your original model to prevent catastrophic forgetting.

In [46]:
FINETUNE_JSON = convert_spacy_to_doccano(data=FINETUNE_DATA, model=movies_ner)

In [47]:
# export JSONL file
with open('finetune.txt', 'w') as json_file:
    for line in FINETUNE_JSON:
        json_record = json.dumps(line)
        json_file.write(json_record + '\n')

### Run Doccano locally

- Follow instructions at https://github.com/doccano
- Import finetune.txt
- Manually correct annotations
- Export Doccano annotations


### Convert Doccano annotations to spaCy format
- TODO: Use doccano-transformer to convert Doccano's JSONL export to spaCy format

In [48]:
#finetune_doccano = read_jsonl(filepath='file.json1', dataset=NERDataset)

In [49]:
#finetune_spacy = finetune_doccano.to_spacy(tokenizer=)

In [50]:
with open('file.json1','r') as doccano:
    finetune_doccano = []
    for line in doccano.readlines():
        finetune_doccano.append(json.loads(line))

In [51]:
finetune_doccano[:5]

[{'id': 23,
  'text': 'who starred in the man without a face',
  'meta': {},
  'annotation_approver': None,
  'labels': []},
 {'id': 24,
  'text': 'tell me more about the movie elektra',
  'meta': {},
  'annotation_approver': None,
  'labels': [[29, 36, 'TITLE']]},
 {'id': 25,
  'text': 'find a move by director john cleese',
  'meta': {},
  'annotation_approver': None,
  'labels': []},
 {'id': 26,
  'text': 'what year was ghandi filmed',
  'meta': {},
  'annotation_approver': None,
  'labels': []},
 {'id': 27,
  'text': 'i want to get a must see gangster movie from the past three years',
  'meta': {},
  'annotation_approver': None,
  'labels': []}]

In [52]:
# convert to spacy format

In [53]:
FINETUNE_DATA[:5]

[['could you please recommend a good horror film that was made in 2011',
  {'entities': [(34, 40, 'GENRE')]}],
 ['name the documentary by sergio leone',
  {'entities': [(9, 20, 'GENRE'), (24, 36, 'DIRECTOR')]}],
 ['what are some sport films that was released within the last seven years',
  {'entities': [(14, 19, 'GENRE'), (55, 71, 'YEAR')]}],
 ['do you like that really popular zombie move made by director laura gabbert',
  {'entities': [(17, 31, 'RATINGS_AVERAGE'),
    (32, 38, 'PLOT'),
    (61, 74, 'DIRECTOR')]}],
 ['show me that man from rio', {'entities': [(8, 25, 'TITLE')]}]]

### Train Model 1 in spaCy

In [54]:
nlp = train_ner_model(model='models/movies_0', train_data=FINETUNE_DATA, n_iter=10)

Losses {'ner': 3648.0199915609164}
Losses {'ner': 2646.55164874975}
Losses {'ner': 2022.2504103559575}
Losses {'ner': 1687.2290640563742}
Losses {'ner': 1374.053428110652}
Losses {'ner': 1199.4915336527429}
Losses {'ner': 1005.58177761978}
Losses {'ner': 916.226192263246}
Losses {'ner': 845.9147403139978}
Losses {'ner': 780.6000975713796}


In [55]:
# Save model to output directory
nlp.meta["name"] = "movies_1"
nlp.to_disk('models/movies_1')

### Evaluate model performance

In [56]:
# Load Model 1
movies_ner = spacy.load('models/movies_1')

In [57]:
scores_model_1 = evaluate_model(movies_ner, TEST_DATA)

In [58]:
print("Precision: {} \nRecall: {} \nF1-score: {}".format(scores_model_1['ents_p'],
                                                         scores_model_1['ents_r'],
                                                         scores_model_1['ents_f']))

Precision: 83.01462317210348 
Recall: 82.9368795654617 
F1-score: 82.97573315843717


In [59]:
# compare to Model 0
print("Precision: {} \nRecall: {} \nF1-score: {}".format(scores_model_0['ents_p'],
                                                         scores_model_0['ents_r'],
                                                         scores_model_0['ents_f']))

Precision: 84.40779610194903 
Recall: 84.36036710994568 
F1-score: 84.38407494145198
