In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [24]:
import json

with open('training_data.json', 'r') as f:
    data = json.load(f)

print(data['examples'][0])

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed', 'content': 'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.', 'metadata': {}, 'annotations': [{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed', 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1', 'end': 98, 'start': 40, 'example_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed', 'tag_name': 'University', 'value': 'Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie', 'correct': None, 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z', 'annotator_id': 1, 'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed', 'name': 'Ashpat123', 'reason': 'exploration'}], 'model_annotations': []}], 'classifications': []}


In [25]:
training_data = {'classes' : ['UNIVERSITY'], 'annotations' : []}
for example in data['examples']:
  temp_dict = {}
  temp_dict['text'] = example['content']
  temp_dict['entities'] = []
  for annotation in example['annotations']:
    start = annotation['start']
    end = annotation['end']
    label = annotation['tag_name'].upper()
    temp_dict['entities'].append((start, end, label))
  training_data['annotations'].append(temp_dict)

print(training_data['annotations'][0])

{'text': 'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.', 'entities': [(40, 98, 'UNIVERSITY')]}


In [26]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("pl") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

In [27]:
from spacy.util import filter_spans

for training_example  in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy") # save the docbin object

100%|██████████| 14/14 [00:00<00:00, 1664.26it/s]


In [97]:
nlp_ner = spacy.load("model-best")

doc = nlp_ner("Dzień dobry, interesuję się studiami na Gdańskim Uniwersytecie Medycznym. Czy mogę dowiedzieć się, jakie kierunki są dostępne na tej uczelni?")

colors = {"UNIVERSITY": "#F67DE3"}
options = {"colors": colors}


ValueError: [E002] Can't find factory for 'transformer' for language Polish (pl). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer, pl.lemmatizer

In [39]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [82]:
text = u'politechnika krakowska jest super'
doc = nlp(text)

show_ents(doc)

No named entities found.


In [80]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create the desired phrase patterns:
phrase_list = ['politechnika krakowska', 'politechnika-krakowska']
phrase_patterns = [nlp(text) for text in phrase_list]



In [81]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
university = doc.vocab.strings[u'UNIVERSITY']

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=university)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

ValueError: [E1010] Unable to set entity information for token 0 which is included in more than one span in entities, blocked, missing or outside.

In [67]:
new_ent

politechnika

In [68]:
phrase_patterns

[politechnika krakowska, politechnika-krakowska]

In [70]:
# Apply the patterns to our matcher object:
matcher.add('poli', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches


[(7253297147861999825, 0, 2), (2156668014677468499, 0, 2)]

In [73]:

# Here we create Spans from each match, and create named entities from them:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'UNIVERSISTY']

In [79]:
new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]
new_ents

ValueError: [E084] Error assigning label ID 15735286411066341544 to span: not in StringStore.

In [74]:
new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents) + new_ents

show_ents(doc)

ValueError: [E084] Error assigning label ID 15735286411066341544 to span: not in StringStore.

In [92]:
doc = nlp(u'Politechnika Krakowska jest super.')

show_ents(doc)
text = u'politechnika krakowska jest super.'

No named entities found.


In [93]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [94]:
# Create the desired phrase patterns:
phrase_list = ['politechnika krakowska', 'politechnika-krakowska']
phrase_patterns = [nlp(text) for text in phrase_list]

In [95]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches

[]