# Właściwy proces uczenia

In [29]:
import spacy
import json
import random
from spacy.tokens import Doc
from spacy.training import Example

Ustawienie katalogów i nazw plików.

In [30]:
import os

# Input: the data directory one level up and the streets file inside it.
src_dir = os.path.join(os.pardir, "data")
src_streets = os.path.join(src_dir, "streets.json")

# Output: the model directory one level up and the trained model's folder inside it.
model_dir = os.path.join(os.pardir, "model")
model = os.path.join(model_dir, "pl_streets_ner_model")

Standardowe metody ładowania i zapisywania danych w formacie JSON.

In [31]:
def load_json_data(file: str):
    """Parse the UTF-8 encoded JSON file at ``file`` and return the decoded object."""
    with open(file, mode='rt', encoding="UTF-8") as source:
        return json.load(source)

def save_json_data(file: str, json_data):
    """Serialize ``json_data`` as JSON into the UTF-8 text file at ``file``.

    The output is indented by four spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(file, mode='wt', encoding="UTF-8") as sink:
        json.dump(json_data, sink, **dump_options)

In [32]:
def train_spacy(data, iterations):
    """Train a blank Polish spaCy pipeline with a single NER component.

    Args:
        data: Sequence of ``(text, annotations)`` pairs. ``annotations`` is a
            dict passed to ``Example.from_dict``; its ``"entities"`` entry is
            iterated and the third item of each entity (``ent[2]``) is
            registered as an NER label. Presumably each entity is
            ``(start, end, label)`` -- confirm against the training file.
        iterations: Number of passes over ``data``.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    # Shuffle a private copy so the caller's list is not reordered in place.
    train_data = list(data)

    nlp = spacy.blank("pl")
    # Bind ``ner`` on both branches so it is always defined below.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.add_pipe("ner", last=True)

    for _, annotations in train_data:
        # A missing or null "entities" entry simply contributes no labels.
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # select_pipes / initialize are the spaCy v3 names for the deprecated
    # disable_pipes / begin_training calls.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            # One update per example; losses accumulate over the whole pass.
            for text, annotations in train_data:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
            print(losses)
    return nlp

Załadowanie danych treningowych, wytrenowanie modelu i zapisanie go na dysk.

In [33]:
# Load the annotated examples and train for 30 passes over them.
TRAIN_DATA = load_json_data("train.json")
nlp = train_spacy(TRAIN_DATA, 30)

# Make sure the parent model directory exists before saving; this is a no-op
# when it is already there.
os.makedirs(model_dir, exist_ok=True)
nlp.to_disk(model)

Starting iteration 0
{'ner': 5642.2146731910925}
Starting iteration 1
{'ner': 1533.369599014779}
Starting iteration 2
{'ner': 967.2621679443416}
Starting iteration 3
{'ner': 687.3161548537772}
Starting iteration 4
{'ner': 482.79542363846303}
Starting iteration 5
{'ner': 381.9042808238548}
Starting iteration 6
{'ner': 318.4234511056336}
Starting iteration 7
{'ner': 204.94598146332615}
Starting iteration 8
{'ner': 188.34409264638705}
Starting iteration 9
{'ner': 181.33440931631054}
Starting iteration 10
{'ner': 139.65806893477549}
Starting iteration 11
{'ner': 107.03276112182088}
Starting iteration 12
{'ner': 112.82203863986473}
Starting iteration 13
{'ner': 116.49646073214248}
Starting iteration 14
{'ner': 132.48624569827817}
Starting iteration 15
{'ner': 62.851887634806346}
Starting iteration 16
{'ner': 68.39062002313612}
Starting iteration 17
{'ner': 83.22005679233891}
Starting iteration 18
{'ner': 63.524630163457765}
Starting iteration 19
{'ner': 98.94152670686191}
Starting iteration