<a href="https://colab.research.google.com/github/hamza1886/fda-bert-ner/blob/master/phase-3/custom_ner_with_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import time
from itertools import chain
from os import path, mkdir

import matplotlib.pyplot as plt
import numpy as np
import spacy
import thinc_gpu_ops
from matplotlib.ticker import MaxNLocator
from spacy import displacy
from spacy.util import minibatch, compounding

In [2]:
def load_data_spacy(file_path):
    """ Converts token-per-line data into spaCy-style NER training records.

    Input:  word \t label \n word \t label \n \n word \t label
            (one token per line; a line without a tab, normally an empty line,
            separates two sentences)
    Output: [sentence, {'entities': [(start, end, label), (start, end, label)]}]

    Every token whose label starts with 'B' or 'I' produces its own
    (start, end, label) span, where start/end are character offsets into the
    space-joined sentence and the label keeps its "B-"/"I-" prefix.
    Consecutive tokens of the same annotation are NOT merged into one span.
    Sentences that contain no such token are skipped.

    file_path : path of the UTF-8 encoded, tab-separated file to read

    returns : (training_data, unique_labels); unique_labels holds every label
              other than 'O' in order of first appearance
    """
    training_data, unique_labels = [], []
    entities, words = [], []
    end = 0  # character offset just past the current word and its trailing space

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.strip('\n').split('\t')

            # lines with more than one field are tokens: word \t label
            if len(fields) > 1:
                word, label = fields[0], fields[1]
                label_type = label[0]  # beginning of annotation - "B-xxx", intermediate - "I-xxx"

                words.append(word)
                start = end
                end += len(word) + 1  # length of the word + the joining space

                # both beginning and intermediate tokens get their own span;
                # `end - 1` drops the joining space again
                if label_type in ('B', 'I'):
                    entities.append((start, end - 1, label))

                if label != 'O' and label not in unique_labels:
                    unique_labels.append(label)

            # lines with a single field are breaks between sentences
            else:
                if entities:
                    training_data.append([' '.join(words), {'entities': entities}])
                # reset the counter and temporary lists for the next sentence
                end = 0
                entities, words = [], []

    # A file that does not end with a separator line still holds a pending
    # sentence here; without this flush the last sentence would be lost.
    if entities:
        training_data.append([' '.join(words), {'entities': entities}])

    return training_data, unique_labels

In [3]:
def load_model(model_path):
    """ Restores a saved pipeline so it can be run on new test sentences

    A blank English pipeline is created, an 'ner' component is added to it
    when it does not have one yet, and the saved state is then read into it
    with from_disk.

    model_path : directory of model saved by spacy.to_disk

    returns : whatever from_disk returns for the populated pipeline
    """
    pipeline = spacy.blank('en')

    # the component has to exist before from_disk is called on the pipeline
    ner_missing = 'ner' not in pipeline.pipe_names
    if ner_missing:
        pipeline.add_pipe(pipeline.create_pipe('ner'))

    return pipeline.from_disk(model_path)

In [4]:
# Parse the three dataset splits with load_data_spacy, which returns a
# (records, labels) pair for each file. Only the label list of the training
# split is kept; the lists returned for the other two splits are discarded.
TRAIN_DATA, LABELS = load_data_spacy('data/train.tsv')  # 60% of ../description/all-description.txt
VALID_DATA, _ = load_data_spacy('data/devel.tsv')  # 20% of ../description/all-description.txt
TEST_DATA, _ = load_data_spacy('data/test.tsv')  # 20% of ../description/all-description.txt

In [5]:
# Qualitative check: run the saved model over test sentences and visualise its predictions
ner = load_model('models/spacy_example')

# keep only the text of the first 100 [sentence, annotations] records
test_sentences = [sentence for sentence, _annotations in TEST_DATA[:100]]
for test_sentence in test_sentences:
    doc = ner(test_sentence)
    # To inspect the raw spans instead of the rendering, print ent.text,
    # ent.start_char, ent.end_char and ent.label_ for every ent in doc.ents.
    displacy.render(doc, style='ent', jupyter=True)