# Topic Extraction Training

In [10]:
import random
import spacy

nlp = spacy.load('en_core_web_sm')

## Get the Keywords

In [11]:
text = ['A real-time integrated data logistics and simple event processing platform Apache NiFi ' + \
        'automates the movement of data between disparate data sources and sy.']

In [12]:
keywords = []
for _, t in enumerate(text):
    kw = []
    parsed_text = nlp(t)
    # taking all named entities as keywords
    for entity in parsed_text.ents:
        #kw.append("%s(%s)" % (entity.text, entity.label_))
        kw.append(entity.text)
    for pt in parsed_text:
        # taking just part of speech tags as keywords
        if pt.dep_ in ('pobj', 'dobj', 'conj', 'compound') and pt.pos_ in ('NOUN') and pt.tag_ not in ('WP'):
            #kw.append("%s(%s,%s)"%(pt, pt.pos_, pt.tag_)) 
            kw.append(str(pt)) 
    # make the list of keyword unique when name entities and objects makes duplicity
    kw = list(set(kw))
    keywords.append(', '.join(str(k) for k in kw))

In [13]:
keywords

['Apache NiFi, sources, movement, data, platform, event, processing']

As we can see `NiFi` it self is missing, nobody will use `Apache Nifi` in conversation. So, let's train it.

## Train missing Keywords

Let's prepare training data containing keywords which we want to train the model and it's entity definitions by position and known NER keywords.

In [14]:
train_data = [
    ('Apache NiFi (short for NiagaraFiles) is a software project from the Apache Software Foundation ' + \
        'designed to automate the flow of data between software systems.', {
        'entities': [(0,6,'ORG'),(7,11,'PRODUCT')]
    })
]

Create the built-in pipeline components and add them to the pipeline `nlp.create_pipe` works for built-ins that are registered with spaCy. Otherwise we need to get it, so we can add labels.

In [15]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

Add labels to the NER pipeline.

In [16]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

Get names of other pipes to disable them during training.

In [17]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

{'ner': 4.4237863973823455}
{'ner': 3.8326782247286757}
{'ner': 3.3267226160073076}
{'ner': 1.0775559683207607}
{'ner': 2.9021405209103617}
{'ner': 4.1294674029182659}
{'ner': 4.3269160880255733}
{'ner': 3.4417642318368848}
{'ner': 3.0216557824286441}
{'ner': 1.719915857021167}
{'ner': 3.2125682612963056}
{'ner': 1.4646328903200989}
{'ner': 3.2772178058582129}
{'ner': 0.86539458591505125}
{'ner': 1.8290357937245514}
{'ner': 1.752638645456996}
{'ner': 1.4776314778831379}
{'ner': 2.4013956641863157}
{'ner': 1.9639375780212429}
{'ner': 1.2426677611738506}


Test the trained model

In [18]:
for text, _ in train_data:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_) for t in doc])


Entities [('Apache', 'ORG'), ('NiFi', 'PRODUCT')]
Tokens [('Apache', 'ORG'), ('NiFi', 'PRODUCT'), ('(', ''), ('short', ''), ('for', ''), ('NiagaraFiles', ''), (')', ''), ('is', ''), ('a', ''), ('software', ''), ('project', ''), ('from', ''), ('the', ''), ('Apache', ''), ('Software', ''), ('Foundation', ''), ('designed', ''), ('to', ''), ('automate', ''), ('the', ''), ('flow', ''), ('of', ''), ('data', ''), ('between', ''), ('software', ''), ('systems', ''), ('.', '')]


So, it recognizes now two new entities `Apache` as the organization and `NiFi` as the product.
