In [1]:
#!/usr/bin/env python
# coding: utf8
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from tqdm.auto import tqdm

In [2]:
CFG = {'device': 1, 'cpu_count': 4}
TESTS = False
spacy.require_gpu()

True

In [3]:
import gzip, json
def load_entries(fn): # '../data/datasets/nerus.jsonl.gz'
    entries = []
    with gzip.open(fn, 'r') as f:
        for line in tqdm(f):
            entry = json.loads(line)
            entries.append(entry)
    return entries
    #del entries

In [4]:
class Corpus:
    ents = {}
    ds_train = []
    ds_test = []

In [5]:
KR = Corpus()
KR.ents = {'CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PHONE', 'QUANTITY', 'TIME'}
KR.ds_test = load_entries('../data/datasets/kaggle_ru_test.jsonl.gz')
KR.ds_train = load_entries('../data/datasets/kaggle_ru_train.jsonl.gz')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [6]:
KR.ents = sorted(set(v[2] for x in KR.ds_train for v in x['entities']))
display(KR.ents)

['CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PHONE', 'QUANTITY', 'TIME']

In [7]:
NERUS = Corpus()
NERUS.ents = {'ORG', 'PER', 'LOC'}
NERUS.ds_test = load_entries('../data/datasets/nerus_test.jsonl.gz')
NERUS.ds_train = load_entries('../data/datasets/nerus_train.jsonl.gz')

# BUGFIX for v0.5
NERUS.ds_test = [{'raw': x['raw'], 'entities': x['entries']} for x in tqdm(NERUS.ds_test)]
NERUS.ds_train = [{'raw': x['raw'], 'entities': x['entries']} for x in tqdm(NERUS.ds_train)]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=719295.0), HTML(value='')))




In [8]:
CORPORA = [KR, NERUS]

In [9]:
for c in CORPORA:
    print(len(c.ents), len(c.ds_train), len(c.ds_test))

7 1087147 20000
3 719295 20000


In [10]:
def get_other_pipes(nlp, x):
    return [pipe for pipe in nlp.pipe_names if pipe != "ner"]

In [11]:
def pluck(dict_list, attr):
    r = []
    for e in dict_list:
        r.append(e[attr])
    return r

def pluck_list(dict_list, *attrs):
    r = []
    for e in dict_list:
        r.append([e[a] for a in attrs])
    return r

def pluck_dict(dict_list, *attrs):
    r = []
    for e in dict_list:
        r.append({a: e[a] for a in attrs})
    return r

In [12]:
def setup_model():
    print("Creating new model...")
    nlp = spacy.blank('ru')
    return nlp
nlp = setup_model()
from tokenizer import set_stemming_tokenizer
set_stemming_tokenizer(nlp)

Creating new model...


In [13]:
def add_ner(nlp, labels, rebuild=True):
    if 'ner' in nlp.pipe_names and rebuild:
        nlp.disable_pipes('ner')
    if 'ner' not in nlp.pipe_names:
        print("Creating new NER...")
        nlp_ner = nlp.create_pipe('ner')
        nlp.add_pipe(nlp_ner)
    else:
        print("Using existing NER...")
        nlp_ner = nlp.get_pipe('ner')
    print(nlp.pipeline)
    for l in labels:
        nlp_ner.add_label(l)
    assert set(labels) <= set(nlp_ner.labels)
    return nlp_ner
nlp_ner = add_ner(nlp, [l for c in CORPORA for l in c.ents], rebuild=True)
print(nlp_ner.labels)

assert nlp.get_pipe('ner') == nlp_ner

Creating new NER...
[('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f8b3a964528>)]
('TIME', 'DATE', 'MONEY', 'ORG', 'PHONE', 'LOC', 'QUANTITY', 'PER', 'ORDINAL', 'CARDINAL')


In [14]:
def tqdm_batches(batches, total=None, leave=True, **info):
    infostr = ', '.join([f"{k}={v}" for k,v in info.items()])
    ll = 0
    batch_iter = tqdm(total=total, leave=leave)
    for batch in batches:
        bl = len(batch)
        if bl > ll:
            batch_iter.set_description(f"bsz={bl} "+infostr)
            ll = bl
        yield batch
        batch_iter.update(bl)
    batch_iter.close()

if TESTS:
    import time
    dataset = range(50000)
    batches = minibatch(dataset, size=compounding(1., 32., 1.0005))
    for b in tqdm_batches(batches, total=50000, epoch=1):
        time.sleep(0.001)

In [33]:
def unseen_names(nlp, unseen):
    ner = nlp.get_pipe('ner')
    return [mt for i, mt in enumerate(ner.move_names) if i in unseen]
    
def enable_all_entities(nlp):
    ner = nlp.get_pipe('ner')
    ner.model.unseen_classes = set()

def enable_entities(nlp, labels):
    ner = nlp.get_pipe('ner')
    # print("Unseen classes were:", unseen_names(nlp, ner.model.unseen_classes))
    unseen = set()
    for i, mt in enumerate(ner.move_names):
        if '-' in mt:
            l = mt.split('-', 1)[1]
            # print(mt, l)
            if l not in labels:
                unseen.add(i)
    # print("Set unseen classes to:", unseen_names(nlp, unseen))
    ner.model.unseen_classes = unseen

if TESTS:
    print('Training', KR.ents)
    enable_entities(nlp, KR.ents)

In [16]:
import spacy
import random
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import pandas
pandas.set_option('display.precision', 3) 

def _evaluate(model, batches):
    if model.get_pipe('ner').model is True:
        print("Initializing model!")
        model.begin_training(**CFG)
    scorer = Scorer()
    for batch in batches:
        docs = pluck(batch, 'raw')
        docs = model.pipe(docs)
        for doc, parse in zip(docs, batch):
            scorer.score(doc, GoldParse(doc, entities=parse['entities']))
    return scorer.scores

def evaluate(model, dataset, batch_size=32):
    batches = tqdm_batches(minibatch(dataset, batch_size), total=len(dataset), leave=False)
    return _evaluate(model, batches)

def evaluate_data_source(model, ds, count=None, batch_size=32):
    enable_entities(model, ds.ents)
    if count:
        dataset = random.sample(ds.ds_test, count)
    else:
        dataset = ds.ds_test
    res = evaluate(model, dataset)
    return {k:v for k,v in res['ents_per_type'].items() if k in ds.ents}

def display_ents(list_of_scores):
    display(pandas.DataFrame.from_records(list_of_scores).T)

if TESTS or 0:
    res = evaluate(nlp, NERUS.ds_test, 1000)
    print(pluck_dict([res], 'token_acc', 'tags_acc', 'uas', 'las'))
    display_ents(res['ents_per_type'])

if TESTS or 0:
    scores = evaluate_data_source(nlp, NERUS, count=1000, batch_size=32)
    display_ents(scores)

In [17]:
import numpy

def _train_epoch(model, labels, batches):
    with model.disable_pipes(*get_other_pipes(nlp, 'ner')):
        if model.get_pipe('ner').model is True:
            print("Initializing model!")
            model.begin_training(**CFG)
        optimizer = model.resume_training(**CFG)
        losses = {}
        n_docs = 0
        for batch in batches:
            texts = pluck(batch, 'raw')
            anns = pluck_dict(batch, 'entities')
            enable_entities(model, labels)
            model.update(texts, anns, drop=0.2, losses=losses, sgd=optimizer)
            n_docs += len(batch)
        meta = {
            'docs': n_docs,
            'loss': {k: numpy.log(1e-10 + (v / n_docs)) for k,v in losses.items()},
        }
    enable_all_entities(model)
    return meta

def train_epoch(model, ds, batch_size, count=None):
    if count is None:
        dataset = ds.ds_train.copy()
        random.shuffle(dataset)
    else:
        dataset = random.sample(ds.ds_train, count)
    batches = minibatch(dataset, size=size_)
    _train_epoch(nlp, ds.ents, tqdm_batches(batches, total=len(dataset)))

if TESTS or 0:
    size_ = compounding(1., 32., 1.001)
    train_epoch(nlp, NERUS, batch_size=size_, count=1000)

if TESTS or 0:
    size_ = compounding(1., 32., 1.001)
    train_epoch(nlp, KR, batch_size=size_, count=1000)

if TESTS or 0:
    res = {}
    for c in CORPORA:
        res.update(evaluate_data_source(nlp, c, 1000))
    display_ents(res)

In [18]:
size_ = compounding(1., 32., 1.0005)
for e in tqdm(range(15)):
    for ds_train in CORPORA:
        train_epoch(nlp, ds_train, batch_size=size_, count=50000)
    res = {}
    for c in CORPORA:
        res.update(evaluate_data_source(nlp, c, 1000))
    display_ents(res)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

Initializing model!


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,0.0,0.0,0.0
DATE,0.0,0.0,0.0
LOC,95.265,94.587,94.925
MONEY,0.0,0.0,0.0
ORDINAL,0.0,0.0,0.0
ORG,85.581,91.595,88.486
PER,92.656,91.883,92.268
PHONE,0.0,0.0,0.0
QUANTITY,0.0,0.0,0.0
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,0.0,0.0,0.0
DATE,0.0,0.0,0.0
LOC,96.763,95.639,96.198
MONEY,0.0,0.0,0.0
ORDINAL,0.0,0.0,0.0
ORG,91.864,90.708,91.282
PER,93.502,94.363,93.93
PHONE,0.0,0.0,0.0
QUANTITY,0.0,0.0,0.0
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,4.142,1.392,2.083
DATE,8.696,0.546,1.028
LOC,95.978,96.377,96.177
MONEY,0.0,0.0,0.0
ORDINAL,0.0,0.0,0.0
ORG,90.568,92.58,91.563
PER,94.034,94.761,94.396
PHONE,0.0,0.0,0.0
QUANTITY,0.0,0.0,0.0
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,1.235,0.18,0.314
DATE,38.776,5.46,9.572
LOC,96.09,96.11,96.1
MONEY,0.0,0.0,0.0
ORDINAL,2.778,2.532,2.649
ORG,91.542,92.411,91.974
PER,94.368,95.258,94.811
PHONE,3.704,5.556,4.444
QUANTITY,0.0,0.0,0.0
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,23.563,8.135,12.094
DATE,48.227,20.299,28.571
LOC,95.146,97.235,96.179
MONEY,0.0,0.0,0.0
ORDINAL,11.765,13.084,12.389
ORG,93.124,90.481,91.783
PER,96.237,93.98,95.095
PHONE,75.0,21.429,33.333
QUANTITY,100.0,1.351,2.667
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,42.079,17.932,25.148
DATE,90.476,22.485,36.019
LOC,96.635,97.298,96.965
MONEY,0.0,0.0,0.0
ORDINAL,25.0,25.301,25.15
ORG,89.693,94.513,92.04
PER,95.778,95.778,95.778
PHONE,20.93,50.0,29.508
QUANTITY,83.333,6.579,12.195
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,62.5,15.091,24.311
DATE,60.759,14.035,22.803
LOC,97.887,96.345,97.11
MONEY,0.0,0.0,0.0
ORDINAL,11.111,5.128,7.018
ORG,93.114,93.805,93.458
PER,93.97,95.812,94.882
PHONE,85.714,37.5,52.174
QUANTITY,100.0,6.25,11.765
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,79.878,24.905,37.971
DATE,76.821,34.627,47.737
LOC,96.921,96.814,96.867
MONEY,0.0,0.0,0.0
ORDINAL,30.909,21.795,25.564
ORG,91.574,93.263,92.411
PER,94.612,95.941,95.272
PHONE,94.737,75.0,83.721
QUANTITY,91.667,14.103,24.444
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,85.616,27.293,41.391
DATE,84.259,27.576,41.553
LOC,95.693,97.96,96.813
MONEY,0.0,0.0,0.0
ORDINAL,56.364,32.979,41.611
ORG,93.883,92.849,93.363
PER,95.525,95.677,95.601
PHONE,66.667,46.154,54.545
QUANTITY,81.818,13.636,23.377
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,96.429,39.488,56.031
DATE,83.051,28.242,42.151
LOC,96.56,97.393,96.975
MONEY,0.0,0.0,0.0
ORDINAL,62.857,24.444,35.2
ORG,94.444,91.969,93.19
PER,95.702,96.44,96.069
PHONE,71.429,83.333,76.923
QUANTITY,93.333,16.471,28.0
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,92.834,52.198,66.823
DATE,92.982,30.904,46.389
LOC,97.788,97.134,97.46
MONEY,0.0,0.0,0.0
ORDINAL,30.952,19.697,24.074
ORG,92.273,93.891,93.075
PER,95.869,96.842,96.353
PHONE,94.118,80.0,86.486
QUANTITY,95.918,53.409,68.613
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,93.488,40.12,56.145
DATE,95.361,57.453,71.705
LOC,97.222,97.523,97.372
MONEY,0.0,0.0,0.0
ORDINAL,74.074,29.851,42.553
ORG,92.335,93.953,93.137
PER,94.917,96.416,95.661
PHONE,91.667,78.571,84.615
QUANTITY,85.714,31.579,46.154
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,96.711,30.435,46.299
DATE,97.63,63.385,76.866
LOC,96.922,98.101,97.508
MONEY,0.0,0.0,0.0
ORDINAL,62.857,29.73,40.367
ORG,91.52,94.342,92.909
PER,96.858,96.303,96.58
PHONE,88.889,84.211,86.486
QUANTITY,100.0,47.312,64.234
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,96.241,54.936,69.945
DATE,97.531,45.143,61.719
LOC,97.305,97.399,97.352
MONEY,0.0,0.0,0.0
ORDINAL,97.5,48.75,65.0
ORG,91.93,94.986,93.433
PER,95.566,96.294,95.929
PHONE,100.0,60.87,75.676
QUANTITY,97.727,55.844,71.074
TIME,0.0,0.0,0.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,87.591,47.809,61.856
DATE,96.635,63.407,76.571
LOC,97.688,97.706,97.697
MONEY,0.0,0.0,0.0
ORDINAL,82.927,50.0,62.385
ORG,90.652,95.166,92.854
PER,95.537,96.442,95.987
PHONE,100.0,90.0,94.737
QUANTITY,94.872,48.684,64.348
TIME,66.667,66.667,66.667





In [19]:
size_ = compounding(1., 32., 1.0005)
for e in tqdm(range(15)):
    for ds_train in CORPORA[::-1]:
        train_epoch(nlp, ds_train, batch_size=size_, count=50000)
    res = {}
    for c in CORPORA:
        res.update(evaluate_data_source(nlp, c, 1000))
    display_ents(res)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,96.8,99.18,97.976
DATE,99.437,99.718,99.577
LOC,0.0,0.0,0.0
MONEY,50.0,25.0,33.333
ORDINAL,95.062,89.535,92.216
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,78.571,88.0
QUANTITY,96.053,97.333,96.689
TIME,50.0,100.0,66.667


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.413,98.239,98.326
DATE,99.365,99.365,99.365
LOC,0.0,0.0,0.0
MONEY,100.0,100.0,100.0
ORDINAL,95.652,95.652,95.652
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,96.154,100.0,98.039
QUANTITY,100.0,98.387,99.187
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.024,96.26,96.64
DATE,98.936,99.465,99.2
LOC,0.0,0.0,0.0
MONEY,62.5,71.429,66.667
ORDINAL,92.784,94.737,93.75
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,95.0,97.436
QUANTITY,95.181,96.341,95.758


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.729,97.694,98.209
DATE,99.728,99.728,99.728
LOC,0.0,0.0,0.0
MONEY,100.0,100.0,100.0
ORDINAL,97.297,96.0,96.644
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,96.667,96.667,96.667
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,95.543,98.207,96.857
DATE,99.687,100.0,99.843
LOC,0.0,0.0,0.0
MONEY,100.0,83.333,90.909
ORDINAL,100.0,97.701,98.837
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,94.186,94.186,94.186
TIME,100.0,75.0,85.714


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.012,95.914,96.952
DATE,100.0,100.0,100.0
LOC,0.0,0.0,0.0
MONEY,100.0,100.0,100.0
ORDINAL,98.864,95.604,97.207
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,93.75,100.0,96.774
QUANTITY,96.203,100.0,98.065


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.856,95.802,96.818
DATE,99.396,99.396,99.396
LOC,0.0,0.0,0.0
MONEY,100.0,100.0,100.0
ORDINAL,94.203,95.588,94.891
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,93.75,96.774
QUANTITY,92.958,97.059,94.964
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,96.712,97.656,97.182
DATE,99.704,100.0,99.852
LOC,0.0,0.0,0.0
MONEY,100.0,62.5,76.923
ORDINAL,95.833,98.571,97.183
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,94.444,94.444,94.444
QUANTITY,100.0,97.895,98.936
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.386,97.812,97.598
DATE,100.0,99.699,99.849
LOC,0.0,0.0,0.0
MONEY,100.0,66.667,80.0
ORDINAL,96.552,96.552,96.552
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,92.958,97.059,94.964
TIME,100.0,75.0,85.714


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.315,98.131,98.223
DATE,99.429,99.713,99.571
LOC,0.0,0.0,0.0
MONEY,100.0,66.667,80.0
ORDINAL,98.649,92.405,95.425
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,94.118,100.0,96.97
TIME,100.0,66.667,80.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.02,97.441,97.73
DATE,99.714,100.0,99.857
LOC,0.0,0.0,0.0
MONEY,100.0,100.0,100.0
ORDINAL,96.104,98.667,97.368
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,95.833,100.0,97.872
QUANTITY,94.937,98.684,96.774
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.932,98.488,98.209
DATE,99.725,100.0,99.862
LOC,0.0,0.0,0.0
MONEY,100.0,83.333,90.909
ORDINAL,97.727,93.478,95.556
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,96.226,100.0,98.077


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.576,97.576,97.576
DATE,99.724,100.0,99.862
LOC,0.0,0.0,0.0
MONEY,100.0,87.5,93.333
ORDINAL,94.203,94.203,94.203
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,100.0,100.0
QUANTITY,95.89,93.333,94.595
TIME,100.0,100.0,100.0


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.2,96.654,97.421
DATE,98.864,99.429,99.145
LOC,0.0,0.0,0.0
MONEY,80.0,57.143,66.667
ORDINAL,94.318,94.318,94.318
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,95.833,97.872
QUANTITY,93.22,100.0,96.491
TIME,62.5,100.0,76.923


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,98.431,98.431,98.431
DATE,98.512,99.399,98.954
LOC,0.0,0.0,0.0
MONEY,85.714,85.714,85.714
ORDINAL,97.403,93.75,95.541
ORG,0.0,0.0,0.0
PER,0.0,0.0,0.0
PHONE,100.0,95.0,97.436
QUANTITY,94.872,97.368,96.104
TIME,100.0,50.0,66.667





In [35]:
size_ = compounding(4., 32., 1.0005)
for e in tqdm(range(1)):
    for ds_train in CORPORA:
        train_epoch(nlp, ds_train, batch_size=size_, count=10000)
    res = {}
    for c in CORPORA:
        res.update(evaluate_data_source(nlp, c, 1000))
    display_ents(res)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Unnamed: 0,p,r,f
CARDINAL,97.645,86.364,91.658
DATE,95.783,98.758,97.248
LOC,97.174,97.194,97.184
MONEY,100.0,40.0,57.143
ORDINAL,97.468,85.556,91.124
ORG,91.104,94.581,92.81
PER,95.173,97.269,96.21
PHONE,90.0,94.737,92.308
QUANTITY,94.286,92.958,93.617
TIME,60.0,75.0,66.667





In [40]:
import spacy.displacy
def view_example(nlp, s):
    print('Text:', s['raw'])
    doc = nlp(s['raw'])
#     print("Actual:", [(e, e.label_) for e in doc.ents])
    print("Expected:", [(s['raw'][a:b],c,a,b) for a,b,c in s['entities']])
    spacy.displacy.render(doc, style='ent')

enable_all_entities(nlp)

for s in NERUS.ds_test[:2]:
    view_example(nlp, s)
for s in KR.ds_test[:3]:
    view_example(nlp, s)

Text: В Москве началось шествие в поддержку кандидата в президенты от "Единой России" Владимира Путина, сообщает корреспондент "Ленты.ру". Колонны двинулись по Фрунзенской набережной в сторону "Лужников", где состоится митинг. На сайте РИА Новости ведется видеотрансляция с мероприятия. "Интерфакс" отмечает, что участники шествия идут по набережной под патриотическую музыку, размахивая флагами. Активисты, по словам заместителя главы предвыборного штаба Путина Алексея Анисимова, заполнили всю Фрунзенскую набережную. "Точным подсчетом занимаются правоохранительные органы", - отметил он. По данным ГУ МВД по Москве, в шествии принимают участие 30 тысяч человек. Когда шествие стартовало, полиция перекрыла проход к месту сбора, отмечает РИА Новости. Очевидцы подтверждают эту информацию в твиттере. "Всех разворачивают. Говорят, что теперь только пешком до Лужи", - пишет, например, пользователь @_Mohandas_. Как заявил агентству  Анисимов, в "Лужниках", тем временем, уже собрались около 10 тысяч

Text: Двукратный олимпийский чемпион по хоккею Владимир Петров назвал кандидатуры пяти тренеров, которые могли бы заменить наставника сборной России Вячеслава Быкова. По словам Петрова, это могли бы сделать Зинэтула Билялетдинов, Андрей Назаров, Федор Канарейкин, Владимир Крикунов или Борис Михайлов, сообщает "Советский спорт". Петров считает, что будет лучше, если главный тренер сборной будет освобожден от работы в клубе и будет заниматься только делами национальной команды. Кроме того, по мнению двукратного олимпийского чемпиона, Федерация хоккея России (ФХР) должна составить контракт тренера таким образом, чтобы в нем были четко прописаны задачи, поставленные перед наставником. Несмотря на то что Петров назвал имена возможных преемников Быкова, он не сказал, должен ли нынешний главный тренер сборной России покинуть свой пост. На прошлой неделе было опубликовано открытое письмо ветеранов российского и советского хоккея с требованием уволить из сборной Игоря Захаркина - помощника Быко

Text: В январе 1966 года по поручению Министерства торговли СССР Сысоев возглавил архитектурное проектирование во вновь созданном институте «Белгипроторг».
Expected: [('1966 года', 'DATE', 9, 18)]


Text: Проверено 15 июня 2010. Архивировано из первоисточника 27 апреля 2012. Need for Speed: Carbon Own the City (англ.).
Expected: [('15 июня 2010', 'DATE', 10, 22), ('27 апреля 2012', 'DATE', 55, 69)]


Text: Многие эти песни повлияли на движения за гражданские права и на многих музыкантов.
Expected: []


  "__main__", mod_spec)


In [22]:
from pathlib import Path
# save model to output directory
def save_model(nlp, output_dir):
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
save_model(nlp, '../ru2_ner_final')


AttributeError: 'function' object has no attribute 'to_disk'

In [31]:
nlp.tokenizer('приветы всем'.split())

TypeError: Argument 'string' has incorrect type (expected str, got list)

In [46]:
spacy.displacy.render(nlp('20 декабря 2019 года на улице Советской, город Новосибирск, мы с Сашей пошли гулять'), 
                      style='ent')