In [1]:
import spacy
import spacy_transformers

In [2]:
!python3 -m spacy download pl_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting pl-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.6.0/pl_core_news_sm-3.6.0-py3-none-any.whl (20.2 MB)
[K     |████████████████████████████████| 20.2 MB 2.2 MB/s eta 0:00:01
Installing collected packages: pl-core-news-sm
Successfully installed pl-core-news-sm-3.6.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')


In [3]:
nlp = spacy.load("pl_core_news_sm")
nlp

<spacy.lang.pl.Polish at 0x28d2ba4a0>

In [34]:
doc = nlp("Stolica polski jest Krakow")

In [35]:
doc

Stolica polski jest Krakow

In [36]:
type(doc)


spacy.tokens.doc.Doc

In [37]:
doc.ents

()

In [38]:
doc.ents[0], type(doc.ents[0])

IndexError: tuple index out of range

In [39]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [40]:
import json
# https://www.kaggle.com/datasets/finalepoch/medical-ner
with open('data.json', 'r') as f:
    data = json.load(f)

In [41]:
data['examples'][0]


{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'content': 'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.',
 'metadata': {},
 'annotations': [{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
   'end': 125,
   'start': 64,
   'example_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'tag_name': 'University',
   'value': 'Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie',
   'correct': None,
   'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
     'annotator_id': 1,
     'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
     'name': 'Ashpat123',
     'reason': 'exploration'}],
   'model_annotations': []}],
 'classifications': []}

In [42]:
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [43]:
data['examples'][0]['content']

'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.'

In [44]:

training_data = []
for example in data['examples']:
  temp_dict = {}
  temp_dict['text'] = example['content']
  temp_dict['entities'] = []
  for annotation in example['annotations']:
    start = annotation['start']
    end = annotation['end']
    label = annotation['tag_name'].upper()
    temp_dict['entities'].append((start, end, label))
  training_data.append(temp_dict)

print(training_data[0])

{'text': 'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.', 'entities': [(64, 125, 'UNIVERSITY')]}


In [45]:
training_data[0]['text']

'Cześć, jestem osobą szukającą informacji o studiach na uczelni Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie.'

In [46]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("pl") # load a new spacy model
doc_bin = DocBin()

In [47]:
from spacy.util import filter_spans

for training_example  in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")


100%|██████████| 14/14 [00:00<00:00, 2897.33it/s]


In [48]:
!python3 -m spacy init fill-config base_config.cfg config.cfg


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [49]:
!python3 -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     35.33    0.00    0.00    0.00    0.00
 61     200         96.28   1006.99  100.00  100.00  100.00    1.00
130     400          0.00      0.00  100.00  100.00  100.00    1.00
230     600          0.00      0.00  100.00  100.00  100.00    1.00
330     800          0.00      0.00  100.00  100.00  100.00    1.00
453    1000          0.00      0.00  100.00  100.00  100.00    1.00
653    1200          0.00      0.00  100.00  100.00  100.00    1.00
853    1400          0.00      0.00  100.00  100.00  100.00    1.00
1053    1600          0.00      0.00  1

In [50]:
nlp_ner = spacy.load("model-best")

In [62]:
doc = nlp_ner("Dzień dobry, interesuję się studiami na Akademii Teatralnej im. Aleksandra Zelwerowicza w Warszawie. Czy m")

spacy.displacy.render(doc, style="ent", jupyter=True)

In [63]:
doc.ents

(Teatralnej im. Aleksandra Zelwerowicza w Warszawie.,)