# Install relevant packages

In [18]:
!pip install torch torchvision Flask spacy
!python -m spacy download de_core_news_sm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# Top level config

In [171]:
model = 'de_core_news_sm'

batch_size = 400

iterations = 5

output_dir = f'./spacy_gerNER_updated_{batch_size}_{iterations}'

# Imports

In [96]:
import pandas as pd
import numpy as np
import spacy
import random
import re

from spacy.training import Example
from pathlib import Path
from tqdm import tqdm

# Read data

In [101]:
train_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-train.tsv', sep='\t', on_bad_lines='skip', engine='python',  names=['tokens', 'NER', 'delete'])
dev_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-dev.tsv',sep='\t', names=['tokens', 'NER', 'delete'], on_bad_lines='skip')
test_raw = pd.read_csv('data/Assignment-Option-1-GermaNER-data/NER-de-test.tsv',sep='\t', names=['tokens', 'NER', 'delete'], on_bad_lines='skip', engine='python')

# Methods

In [122]:
def clean_test_data(data):

  data = data[data.index != '#']

  data = data.iloc[:, :-1]

  return data

In [123]:
def build_model_data(data):

    data = data[data.index != '#']

    data = data.iloc[:, :-1]

    data['NER'] = data['NER'].str.replace(r'\S+PER\w*', 'PER', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+LOC\w*', 'LOC', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+ORG\w*', 'ORG', regex=True)
    data['NER'] = data['NER'].str.replace(r'\S+OTH\w*', 'MISC', regex=True)

    data_processed = []
    sentences = []
    block_sent = []
    block_ents = []

    round = 0
    current_pos = 0

    for row in data.iterrows():
        if int(row[0]) == 1 and round != 0:
            current_pos = 0
            sent = ' '.join([str(word) for word in block_sent])
            sentences.append(sent)
            ent = {'entities': block_ents}
            train_point = (sent, ent)
            data_processed.append(train_point)

            block_sent = []
            block_ents = []

        try:
            block_sent.append(row[1].tokens)
            if row[1].NER != 'O':
                block_ents.append((current_pos, current_pos + len(row[1].tokens), row[1].NER))
            current_pos = len(row[1].tokens) + current_pos + 1

        except Exception as e:
            continue

        round += 1
    return data_processed

In [46]:
def get_batches(l, n):
    for i in range(0, len(l), n):
            yield l[i:i + n]

In [77]:
def train_spacy(data, iterations):
    TRAIN_DATA = data

    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        blank_model = False
    else:
        nlp = spacy.blank('de')
        print("Created blank 'de' model")
        blank_model = True
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if blank_model:
            optimizer = nlp.begin_training
        else:
            optimizer = nlp.create_optimizer()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = get_batches(TRAIN_DATA, batch_size)
            for batch in batches:
                for text, annotations in tqdm(batch):
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example],
                        drop=0.2,
                        sgd=optimizer,
                        losses=losses)
            print("losses", losses)
    return nlp

In [164]:
# Transform output of model to format that perl accepts token - gold_label - None - pred_label - None
# Don't forget to add I-/ B- to as prefix
def create_perl_comp_data(test, test_clean):

    nlp_new = spacy.load(output_dir)
    entities_map = {}
    tokens = test_clean['tokens'].tolist()
    re_mapped_entities = []
    gold_labels = test_clean['NER'].tolist()
    fill1 = []
    fill2 = []
    bnum = 0
    blen = round(len(test)/200)

    print('Started transformation for perl evaluation')
    batches = get_batches(test, batch_size)
    for batch in batches:
        bnum += 1
        print(f'Batch number {bnum} of {blen}')
        for txt, _ in batch:
            doc = nlp_new(txt)
            for ent in doc.ents:
                ent_map = {ent.text: ent.label_}
                entities_map.update(ent_map)

    for token in tokens:
        fill1.append('O')
        fill2.append('O')
        if token in entities_map.keys():
            ner = entities_map.get(token)
            if ner not in re_mapped_entities:
                ent = 'B-' + ner
                re_mapped_entities.append(ent)
            else:
                ent = 'I-' + ner
                re_mapped_entities.append(ent)
        else:
            re_mapped_entities.append('O')

    re_mapped_entities = [re.sub(r'(\S+)MISC\w*', r'\1OTH', ent) for ent in re_mapped_entities]

    print('Finished transformation')

    # Test if empty columns are required
    out = list(zip(tokens, gold_labels,fill1, re_mapped_entities, fill2))
    df = pd.DataFrame(out, columns = ['entity', 'gold_label', 'filler1', 'pred_label', 'filler2'])
    df.drop(0)

    df.to_csv(f'eval_outputs/spacy_gerNER_eval_{batch_size}_{iterations}_{model}.tsv', sep="\t")

    return df, entities_map, re_mapped_entities, tokens


# Main function

## Process input files

In [160]:
train = build_model_data(train_raw)
dev = build_model_data(dev_raw)
test = build_model_data(test_raw)

## Train model

In [172]:
nlp = train_spacy(train, iterations)

# Training of all 29.000 rows took about ~22m (batch_size=200, itn=2)
# 

Loaded model 'de_core_news_sm'
Starting iteration 0


100%|██████████| 400/400 [00:24<00:00, 16.47it/s]
100%|██████████| 400/400 [00:30<00:00, 13.18it/s]
100%|██████████| 400/400 [00:12<00:00, 32.32it/s]
100%|██████████| 400/400 [00:18<00:00, 21.50it/s]
100%|██████████| 400/400 [00:19<00:00, 20.25it/s]
100%|██████████| 400/400 [00:21<00:00, 18.68it/s]
100%|██████████| 400/400 [00:13<00:00, 30.71it/s]
100%|██████████| 400/400 [00:16<00:00, 24.90it/s]
100%|██████████| 400/400 [00:29<00:00, 13.75it/s]
100%|██████████| 400/400 [00:22<00:00, 17.73it/s]
100%|██████████| 400/400 [00:33<00:00, 12.05it/s]
100%|██████████| 400/400 [00:24<00:00, 16.27it/s]
100%|██████████| 400/400 [00:11<00:00, 33.60it/s]
100%|██████████| 400/400 [00:13<00:00, 29.56it/s]
100%|██████████| 400/400 [00:14<00:00, 28.30it/s]
100%|██████████| 400/400 [00:31<00:00, 12.60it/s]
100%|██████████| 400/400 [00:40<00:00,  9.97it/s]
100%|██████████| 400/400 [00:25<00:00, 15.98it/s]
100%|██████████| 400/400 [00:32<00:00, 12.47it/s]
100%|██████████| 400/400 [00:17<00:00, 22.76it/s]


losses {'ner': 23217.816196028274}
Starting iteration 1


100%|██████████| 400/400 [00:26<00:00, 14.95it/s]
100%|██████████| 400/400 [00:23<00:00, 16.86it/s]
100%|██████████| 400/400 [00:07<00:00, 54.24it/s]
100%|██████████| 400/400 [00:17<00:00, 22.36it/s]
100%|██████████| 400/400 [00:15<00:00, 25.56it/s]
100%|██████████| 400/400 [00:32<00:00, 12.36it/s]
100%|██████████| 400/400 [00:30<00:00, 13.27it/s]
100%|██████████| 400/400 [00:10<00:00, 37.15it/s]
100%|██████████| 400/400 [00:21<00:00, 18.95it/s]
100%|██████████| 400/400 [00:43<00:00,  9.27it/s]
100%|██████████| 400/400 [00:31<00:00, 12.67it/s]
100%|██████████| 400/400 [00:11<00:00, 34.00it/s]
100%|██████████| 400/400 [00:10<00:00, 37.41it/s]
100%|██████████| 400/400 [00:23<00:00, 17.08it/s]
100%|██████████| 400/400 [00:15<00:00, 25.44it/s]
100%|██████████| 400/400 [00:23<00:00, 16.92it/s]
100%|██████████| 400/400 [00:24<00:00, 16.34it/s]
100%|██████████| 400/400 [00:26<00:00, 15.23it/s]
100%|██████████| 400/400 [00:19<00:00, 21.02it/s]
100%|██████████| 400/400 [00:21<00:00, 18.26it/s]


losses {'ner': 19712.838025730918}
Starting iteration 2


100%|██████████| 400/400 [00:16<00:00, 24.23it/s]
100%|██████████| 400/400 [00:45<00:00,  8.85it/s]
100%|██████████| 400/400 [00:12<00:00, 32.58it/s]
100%|██████████| 400/400 [00:11<00:00, 33.64it/s]
100%|██████████| 400/400 [00:19<00:00, 20.55it/s]
100%|██████████| 400/400 [00:14<00:00, 26.82it/s]
100%|██████████| 400/400 [00:42<00:00,  9.31it/s]
100%|██████████| 400/400 [00:14<00:00, 28.32it/s]
100%|██████████| 400/400 [00:30<00:00, 13.03it/s]
100%|██████████| 400/400 [00:26<00:00, 15.36it/s]
100%|██████████| 400/400 [00:27<00:00, 14.80it/s]
100%|██████████| 400/400 [00:19<00:00, 20.90it/s]
100%|██████████| 400/400 [00:22<00:00, 17.94it/s]
100%|██████████| 400/400 [00:19<00:00, 21.02it/s]
100%|██████████| 400/400 [00:19<00:00, 20.43it/s]
100%|██████████| 400/400 [00:23<00:00, 16.67it/s]
100%|██████████| 400/400 [00:14<00:00, 28.03it/s]
100%|██████████| 400/400 [00:13<00:00, 30.27it/s]
100%|██████████| 400/400 [00:19<00:00, 20.61it/s]
100%|██████████| 400/400 [00:11<00:00, 34.44it/s]


In [162]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to spacy_gerNER_updated


## Prepare output

In [167]:
test_clean = clean_test_data(test_raw)

df, entities_map, re_mapped_entities, tokens = create_perl_comp_data(test, test_clean)

# Execution time on full test set ~8m

Started transformation for perl evaluation
Batch number 1 of 15
Batch number 2 of 15
Batch number 3 of 15
Batch number 4 of 15
Batch number 5 of 15
Batch number 6 of 15
Batch number 7 of 15
Batch number 8 of 15
Batch number 9 of 15
Batch number 10 of 15
Batch number 11 of 15
Batch number 12 of 15
Batch number 13 of 15
Batch number 14 of 15
Batch number 15 of 15
Finished transformation


# Evaluation

In [149]:
!pip install perl

8836.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting perl
  Downloading perl-1.0.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: perl
  Building wheel for perl (setup.py) ... [?25ldone
[?25h  Created wheel for perl: filename=perl-1.0.0-py3-none-any.whl size=10092 sha256=46480390d5a0e3ca7e536e497da603c19d96737c1594ee50f5a7b270b7a7f71c
  Stored in directory: /Users/kevinkraus/Library/Caches/pip/wheels/5c/d8/42/910871dc993f324c8faaaa5ca2b283ee305303e17368ccf153
Successfully built perl
Installing collected packages: perl
Successfully installed perl-1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [169]:
!perl nereval.perl < eval_outputs/spacy_gerNER_eval_200_2_de_core_news_sm.tsv

11457.24s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


unexpected number of features: 4 (6)
