# Install relevant packages

In [18]:
!pip install torch torchvision Flask spacy perl
!python -m spacy download de_core_news_sm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# Top level config

In [426]:
model = 'de_core_news_sm'
#model = None

batch_size = 400

iterations = 5

output_dir = f'./spacy_gerNER_updated_{batch_size}_{iterations}_{model}'

# Imports

In [427]:
import pandas as pd
import numpy as np
import spacy
import random
import re

from spacy.training import Example
from pathlib import Path
from tqdm import tqdm

# Read data

In [428]:
train_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-train.tsv', sep='\t', quoting = 3, names=['tokens', 'NER', 'delete'])
dev_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-dev.tsv',sep='\t', names=['tokens', 'NER', 'delete'])
test_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-test.tsv',sep='\t', quoting = 3, names=['tokens', 'NER', 'delete'], on_bad_lines='skip', engine='python')

# Methods

In [429]:
def clean_test_data(data):

  data = data[data.index != '#']

  data = data.iloc[:, :-1]

  return data

In [430]:
def build_model_data(data):

    data = data[data.index != '#']

    data = data.iloc[:, :-1]

    data['NER'] = data['NER'].str.replace(r'\S+PER\w*', 'PER', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+LOC\w*', 'LOC', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+ORG\w*', 'ORG', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+OTH\w*', 'MISC', regex=True)



    data_processed = []
    sentences = []
    block_sent = []
    block_ents = []

    round = 0
    current_pos = 0

    for row in data.iterrows():
        if int(row[0]) == 1 and round != 0:
            current_pos = 0
            sent = ' '.join([str(word) for word in block_sent])
            sentences.append(sent)
            ent = {'entities': block_ents}
            train_point = (sent, ent)
            data_processed.append(train_point)

            block_sent = []
            block_ents = []

        try:
            block_sent.append(row[1].tokens)
            if row[1].NER != 'O':
                block_ents.append((current_pos, current_pos + len(row[1].tokens), row[1].NER))
            current_pos = len(row[1].tokens) + current_pos + 1

        except Exception as e:
            continue

        round += 1
    return data_processed

In [431]:
def get_batches(l, n):
    for i in range(0, len(l), n):
            yield l[i:i + n]

In [432]:
def train_spacy(data, iterations):
    TRAIN_DATA = data

    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        blank_model = False
    else:
        nlp = spacy.blank('de')
        print("Created blank 'de' model")
        blank_model = True
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if blank_model:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.create_optimizer()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = get_batches(TRAIN_DATA, batch_size)
            for batch in batches:
                for text, annotations in tqdm(batch):
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example],
                        drop=0.2,
                        sgd=optimizer,
                        losses=losses)
            print("losses", losses)
    return nlp

In [433]:
# Transform output of model to format that perl accepts token - gold_label - None - pred_label - None
# Don't forget to add I-/ B- to as prefix
def create_perl_comp_data(test, test_clean):

    nlp_new = spacy.load(output_dir)
    entities_map = {}
    tokens = test_clean['tokens'].tolist()
    re_mapped_entities = []
    gold_labels = test_clean['NER'].tolist()
    fill1 = []
    fill2 = []
    bnum = 0
    blen = round(len(test)/200)

    print('Started transformation for perl evaluation')
    batches = get_batches(test, batch_size)
    for batch in batches:
        bnum += 1
        print(f'Batch number {bnum} of {blen}')
        for txt, _ in batch:
            doc = nlp_new(txt)
            for ent in doc.ents:
                ent_map = {ent.text: ent.label_}
                entities_map.update(ent_map)

    for token in tokens:
        fill1.append('O')
        fill2.append('O')
        if token in entities_map.keys():
            ner = entities_map.get(token)
            if ner not in re_mapped_entities:
                ent = 'B-' + ner
                re_mapped_entities.append(ent)
            else:
                ent = 'I-' + ner
                re_mapped_entities.append(ent)
        else:
            re_mapped_entities.append('O')

    re_mapped_entities = [re.sub(r'(\S+)MISC\w*', r'\1OTH', ent) for ent in re_mapped_entities]

    print('Finished transformation')

    # Test if empty columns are required
    #out = list(zip(tokens, gold_labels,fill1, re_mapped_entities, fill2))
   # df = pd.DataFrame(out, columns = ['entity', 'gold_label', 'filler1', 'pred_label', 'filler2'])
    out = list(zip(tokens, gold_labels, fill2, re_mapped_entities, fill1))
    df = pd.DataFrame(out)
    df.drop(0)

    df.to_csv(f'eval_outputs/spacy_gerNER_eval_{batch_size}_{iterations}_{model}.tsv', sep="\t")
    print(f'Saved evaluation file to: eval_outputs/spacy_gerNER_eval_{batch_size}_{iterations}_{model}.tsv')

    return df, entities_map, re_mapped_entities, tokens


# Main function

## Process input files

In [434]:
train = build_model_data(train_raw)
dev = build_model_data(dev_raw)
test = build_model_data(test_raw)

## Train model

In [435]:
random.seed(1)

n = 5000
sample = random.sample(train, n)

nlp = train_spacy(sample, iterations)

# Training of all 29.000 rows took about ~22m (batch_size=200, itn=2)
# 53m with batch_size=400 and itn=5

# ~10min Training blank model with batch_size = 400 and itn=5

Loaded model 'de_core_news_sm'
Starting iteration 0


100%|██████████| 400/400 [00:07<00:00, 56.21it/s]
100%|██████████| 400/400 [00:06<00:00, 57.41it/s]
100%|██████████| 400/400 [00:06<00:00, 58.03it/s]
100%|██████████| 400/400 [00:07<00:00, 57.10it/s]
100%|██████████| 400/400 [00:07<00:00, 55.68it/s]
100%|██████████| 400/400 [00:07<00:00, 50.95it/s]
100%|██████████| 400/400 [00:07<00:00, 53.76it/s]
100%|██████████| 400/400 [00:07<00:00, 56.56it/s]
100%|██████████| 400/400 [00:06<00:00, 58.29it/s]
100%|██████████| 400/400 [00:07<00:00, 56.80it/s]
100%|██████████| 400/400 [00:07<00:00, 56.73it/s]
 63%|██████▎   | 252/400 [00:04<00:02, 56.16it/s]

In [191]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to spacy_gerNER_updated_400_2_None


## Prepare output

In [420]:
test_clean = clean_test_data(test_raw)

df, entities_map, re_mapped_entities, tokens = create_perl_comp_data(test, test_clean)

# Execution time on full test set ~8m

Started transformation for perl evaluation
Batch number 1 of 15
Batch number 2 of 15
Batch number 3 of 15
Batch number 4 of 15
Batch number 5 of 15
Batch number 6 of 15
Batch number 7 of 15
Batch number 8 of 15
Finished transformation
Saved evaluation file to: eval_outputs/spacy_gerNER_eval_400_2_None.tsv


# Evaluation

In [None]:
!perl nereval.perl < eval_outputs/spacy_gerNER_eval_400_5_de_core_news_sm.tsv

In [425]:
!perl nereval.perl < eval_outputs/spacy_gerNER_eval_400_5_None.tsv


22058.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


STRICT: Found: 11132 outer and 0 inner phrases; Gold: 6178 (outer) and 0 (inner).
LOOSE: Found: 11132 outer and 0 inner phrases; Gold: 6178 (outer) and 0 (inner).

1. Strict, Combined Evaluation (official):
Accuracy:  92.63%;
Precision:  10.09%;
Recall:  18.18%;
FB1:  12.98

2. Loose, Combined Evaluation:
Accuracy:  92.74%;
Precision:  11.99%;
Recall:  21.61%;
FB1:  15.42

3.1 Per-Level Evaluation (outer chunks):
Accuracy:  85.25%;
Precision:  10.09%;
Recall:  18.18%;
FB1:  12.98

3.2 Per-Level Global Evaluation (inner chunks):
Accuracy: 100.00%;
Precision:   0.00%;
Recall:   0.00%;
FB1:   0.00


Evaluation per type and mode:

==>        LOC
Outer strict: Precision:   8.92%; Recall:  34.06%; FB1:  14.14
Inner strict: Precision:   0.00%; Recall:   0.00%; FB1:   0.00
Outer loose: Precision:  11.81%; Recall:  32.37%; FB1:  17.31
Inner loose: Precision:   0.00%; Recall:   0.00%; FB1:   0.00
==>   LOCderiv
Outer strict: Precision:   0.00%; Recall:   0.00%; FB1:   0.00
Inner strict: Precisio