# Install relevant packages

In [18]:
!pip install torch torchvision Flask spacy
!python -m spacy download de_core_news_sm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# Top level config

In [185]:
#model = 'de_core_news_sm'
model = None

batch_size = 400

iterations = 2

output_dir = f'./spacy_gerNER_updated_{batch_size}_{iterations}_{model}'

# Imports

In [96]:
import pandas as pd
import numpy as np
import spacy
import random
import re

from spacy.training import Example
from pathlib import Path
from tqdm import tqdm

# Read data

In [101]:
train_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-train.tsv', sep='\t', on_bad_lines='skip', engine='python',  names=['tokens', 'NER', 'delete'])
dev_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-dev.tsv',sep='\t', names=['tokens', 'NER', 'delete'], on_bad_lines='skip')
test_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-test.tsv',sep='\t', names=['tokens', 'NER', 'delete'], on_bad_lines='skip', engine='python')

# Methods

In [122]:
def clean_test_data(data):

  data = data[data.index != '#']

  data = data.iloc[:, :-1]

  return data

In [123]:
def build_model_data(data):

    data = data[data.index != '#']

    data = data.iloc[:, :-1]

    data['NER'] = data['NER'].str.replace(r'\S+PER\w*', 'PER', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+LOC\w*', 'LOC', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+ORG\w*', 'ORG', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+OTH\w*', 'MISC', regex=True)

    data_processed = []
    sentences = []
    block_sent = []
    block_ents = []

    round = 0
    current_pos = 0

    for row in data.iterrows():
        if int(row[0]) == 1 and round != 0:
            current_pos = 0
            sent = ' '.join([str(word) for word in block_sent])
            sentences.append(sent)
            ent = {'entities': block_ents}
            train_point = (sent, ent)
            data_processed.append(train_point)

            block_sent = []
            block_ents = []

        try:
            block_sent.append(row[1].tokens)
            if row[1].NER != 'O':
                block_ents.append((current_pos, current_pos + len(row[1].tokens), row[1].NER))
            current_pos = len(row[1].tokens) + current_pos + 1

        except Exception as e:
            continue

        round += 1
    return data_processed

In [46]:
def get_batches(l, n):
    for i in range(0, len(l), n):
            yield l[i:i + n]

In [188]:
def train_spacy(data, iterations):
    TRAIN_DATA = data

    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        blank_model = False
    else:
        nlp = spacy.blank('de')
        print("Created blank 'de' model")
        blank_model = True
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if blank_model:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.create_optimizer()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = get_batches(TRAIN_DATA, batch_size)
            for batch in batches:
                for text, annotations in tqdm(batch):
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example],
                        drop=0.2,
                        sgd=optimizer,
                        losses=losses)
            print("losses", losses)
    return nlp

In [164]:
# Transform output of model to format that perl accepts token - gold_label - None - pred_label - None
# Don't forget to add I-/ B- to as prefix
def create_perl_comp_data(test, test_clean):

    nlp_new = spacy.load(output_dir)
    entities_map = {}
    tokens = test_clean['tokens'].tolist()
    re_mapped_entities = []
    gold_labels = test_clean['NER'].tolist()
    fill1 = []
    fill2 = []
    bnum = 0
    blen = round(len(test)/200)

    print('Started transformation for perl evaluation')
    batches = get_batches(test, batch_size)
    for batch in batches:
        bnum += 1
        print(f'Batch number {bnum} of {blen}')
        for txt, _ in batch:
            doc = nlp_new(txt)
            for ent in doc.ents:
                ent_map = {ent.text: ent.label_}
                entities_map.update(ent_map)

    for token in tokens:
        fill1.append('O')
        fill2.append('O')
        if token in entities_map.keys():
            ner = entities_map.get(token)
            if ner not in re_mapped_entities:
                ent = 'B-' + ner
                re_mapped_entities.append(ent)
            else:
                ent = 'I-' + ner
                re_mapped_entities.append(ent)
        else:
            re_mapped_entities.append('O')

    re_mapped_entities = [re.sub(r'(\S+)MISC\w*', r'\1OTH', ent) for ent in re_mapped_entities]

    print('Finished transformation')

    # Test if empty columns are required
    out = list(zip(tokens, gold_labels,fill1, re_mapped_entities, fill2))
    df = pd.DataFrame(out, columns = ['entity', 'gold_label', 'filler1', 'pred_label', 'filler2'])
    df.drop(0)

    df.to_csv(f'eval_outputs/spacy_gerNER_eval_{batch_size}_{iterations}_{model}.tsv', sep="\t")

    return df, entities_map, re_mapped_entities, tokens


# Main function

## Process input files

In [160]:
train = build_model_data(train_raw)
dev = build_model_data(dev_raw)
test = build_model_data(test_raw)

## Train model

In [189]:
random.seed(1)

n = 5000
sample = random.sample(train, n)

nlp = train_spacy(sample, iterations)

# Training of all 29.000 rows took about ~22m (batch_size=200, itn=2)
# 53m with batch_size=400 and itn=5

# ~10min Training blank model with batch_size = 400 and itn=5

Created blank 'de' model
Starting iteration 0


100%|██████████| 400/400 [00:21<00:00, 18.22it/s]
100%|██████████| 400/400 [00:13<00:00, 28.71it/s]
100%|██████████| 400/400 [00:18<00:00, 22.04it/s]
100%|██████████| 400/400 [00:23<00:00, 17.00it/s]
100%|██████████| 400/400 [00:18<00:00, 21.63it/s]
100%|██████████| 400/400 [00:20<00:00, 19.97it/s]
5	Selection	I-OTH	O
6	11..." with entities "[(11, 16, 'MISC'), (17, 62, 'MISC'), (80, 84, 'ORG...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
100%|██████████| 400/400 [00:21<00:00, 18.43it/s]
100%|██████████| 400/400 [00:28<00:00, 14.09it/s]
100%|██████████| 400/400 [00:07<00:00, 56.31it/s]
100%|██████████| 400/400 [00:27<00:00, 14.70it/s]
100%|██████████| 400/400 [00:20<00:00, 19.25it/s]
100%|██████████| 400/400 [00:17<00:00, 22.96it/s]
100%|██████████| 200/200 [00:06<00:00, 29.79it/s]


losses {'ner': 13484.674697839559}
Starting iteration 1


100%|██████████| 400/400 [00:29<00:00, 13.52it/s]
100%|██████████| 400/400 [00:25<00:00, 15.71it/s]
100%|██████████| 400/400 [00:18<00:00, 21.46it/s]
100%|██████████| 400/400 [00:12<00:00, 32.98it/s]
100%|██████████| 400/400 [00:11<00:00, 35.28it/s]
100%|██████████| 400/400 [00:18<00:00, 21.16it/s]
100%|██████████| 400/400 [00:21<00:00, 18.95it/s]
100%|██████████| 400/400 [00:13<00:00, 30.02it/s]
100%|██████████| 400/400 [00:26<00:00, 15.24it/s]
100%|██████████| 400/400 [00:08<00:00, 49.92it/s]
100%|██████████| 400/400 [00:26<00:00, 15.08it/s]
100%|██████████| 400/400 [00:21<00:00, 18.32it/s]
100%|██████████| 200/200 [00:13<00:00, 15.05it/s]

losses {'ner': 10937.860042865592}





In [191]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to spacy_gerNER_updated_400_2_None


## Prepare output

In [192]:
test_clean = clean_test_data(test_raw)

df, entities_map, re_mapped_entities, tokens = create_perl_comp_data(test[:100], test_clean[:100])

# Execution time on full test set ~8m

Started transformation for perl evaluation
Batch number 1 of 0
Finished transformation


In [181]:
df.head(50)

Unnamed: 0,entity,gold_label,filler1,pred_label,filler2
0,1951,O,O,O,O
1,bis,O,O,O,O
2,1953,O,O,O,O
3,wurde,O,O,O,O
4,der,O,O,O,O
5,nördliche,O,O,O,O
6,Teil,O,O,O,O
7,als,O,O,O,O
8,Jugendburg,O,O,O,O
9,des,O,O,O,O


# Evaluation

In [149]:
!pip install perl

8836.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting perl
  Downloading perl-1.0.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: perl
  Building wheel for perl (setup.py) ... [?25ldone
[?25h  Created wheel for perl: filename=perl-1.0.0-py3-none-any.whl size=10092 sha256=46480390d5a0e3ca7e536e497da603c19d96737c1594ee50f5a7b270b7a7f71c
  Stored in directory: /Users/kevinkraus/Library/Caches/pip/wheels/5c/d8/42/910871dc993f324c8faaaa5ca2b283ee305303e17368ccf153
Successfully built perl
Installing collected packages: perl
Successfully installed perl-1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [193]:
!perl nereval.perl < eval_outputs/spacy_gerNER_eval_200_2_de_core_news_sm.tsv

16329.57s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


unexpected number of features: 4 (6)
