In [1]:
import spacy
from spacy import displacy
import glob
from tqdm import tqdm
import json

In [2]:
# Read the annotated training examples (JSON) into memory.
with open('train_data.json', mode='r', encoding='utf-8') as train_file:
    training_data = json.load(train_file)

In [3]:
# Read the annotated test examples (JSON) into memory.
with open('test_data.json', mode='r', encoding='utf-8') as test_file:
    testing_data = json.load(test_file)

In [4]:
# Show the first test record to check its layout ('text' plus 'entities').
testing_data[0]

{'text': 'Gute Köchin neben Stuben mädchen gesucht. 2. Bez., Unt. Donaustr. 29, Tür 11. 21856',
 'entities': [[5, 11, 'position']]}

In [5]:
from spacy.tokens import DocBin
from spacy.util import filter_spans

# Blank German pipeline; used below to tokenize the raw texts (nlp.make_doc).
nlp = spacy.blank('de')

In [6]:
def build_doc_bin(examples, output_path):
    """Convert annotated examples into a spaCy ``DocBin`` and save it to disk.

    Args:
        examples: Iterable of dicts with a ``'text'`` string and an
            ``'entities'`` list of ``(start, end, label)`` character offsets.
        output_path: File the serialized ``DocBin`` is written to.

    Returns:
        The populated ``DocBin`` (already written to ``output_path``).
    """
    doc_bin = DocBin()

    for example in tqdm(examples):
        text = example['text']
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in example['entities']:
            # "contract" snaps the character offsets inwards to token
            # boundaries; char_span returns None when no token fits.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # Report which annotation was dropped so it can be fixed in the data.
                print(f"Skipping entity {label!r} at [{start}:{end}]: {text[start:end]!r}")
            else:
                spans.append(span)
        # doc.ents must not contain overlapping spans; filter_spans resolves overlaps.
        doc.ents = filter_spans(spans)
        doc_bin.add(doc)

    doc_bin.to_disk(output_path)
    return doc_bin


train_doc_bin = build_doc_bin(training_data, "train.spacy")
test_doc_bin = build_doc_bin(testing_data, "test.spacy")

 42%|███████████████████████████████▉                                             | 617/1486 [00:00<00:00, 3054.33it/s]

Skipping entity
Skipping entity
Skipping entity


100%|████████████████████████████████████████████████████████████████████████████| 1486/1486 [00:00<00:00, 3278.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 637/637 [00:00<00:00, 3816.86it/s]

Skipping entity
Skipping entity
Skipping entity





In [7]:
# Prerequisite: generate base_config.cfg with the quickstart widget in spaCy's training documentation (https://spacy.io/usage/training) and place it in the working directory.

In [8]:
# Expand base_config.cfg into a complete config.cfg (all unset values are auto-filled).
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train the pipeline described by config.cfg; results are saved to the "ner" output directory.
# NOTE(review): test.spacy is passed as the dev set, and the same test data is used for the
# evaluation further down, so those results may be optimistic — consider a separate dev split.
!python -m spacy train config.cfg --output ner --paths.train train.spacy --paths.dev test.spacy

[38;5;4m[i] Saving to output directory: ner[0m
[38;5;4m[i] Using CPU[0m
[38;5;4m[i] To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     34.50    5.90    3.10   60.38    0.06
  0     200         42.81   1236.22   77.06   83.29   71.70    0.77
  0     400         42.49    475.37   82.25   79.45   85.26    0.82
  1     600         73.37    366.62   83.27   80.22   86.56    0.83
  1     800        114.77    378.58   85.30   87.97   82.78    0.85
  2    1000        158.29    304.47   85.66   86.07   85.26    0.86
  3    1200        213.88    220.06   84.57   92.08   78.18    0.85
  4    1400        280.54    195.05   87.26   90.29   84.43    0.87
  6    1600        263.67    152.35   84.74

In [10]:
# Load the trained NER pipeline from the training output directory ("ner").
# The commented-out line is the alternative for a GPU-trained model.
#nlp_ner = spacy.load('model-best_GPU') # GPU model
nlp_ner = spacy.load('ner/model-best') # CPU model

In [11]:
import requests
import re

def lemmatize(token, timeout=30):
    """Look up the lemma of a token via the Deutsches Textarchiv CAB web service.

    The token is sent to the DTA "cab" demo endpoint and the value following
    the ``[moot/lemma]`` tag is extracted from the plain-text response.

    Args:
        token: Word form to lemmatize. It is percent-encoded before being
            placed in the query string.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The lemma as a string, or ``None`` when the request fails, the service
        answers with a status other than 200, or the response contains no
        ``[moot/lemma]`` tag. A message is printed in each of those cases.
    """
    # Local import: only needed here, and keeps the cell self-contained.
    from urllib.parse import quote

    # Encode the token so characters such as '&', '#', '+' or spaces cannot
    # change the structure of the query string.
    encoded_token = quote(str(token), safe='')
    url = f'https://www.deutschestextarchiv.de/demo/cab/query?a=default&fmt=text&clean=1&pretty=1&raw=1&q={encoded_token}'

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page: {exc}")
        return None

    if response.status_code != 200:
        # Return early: there is no response body worth parsing.
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    match = re.search(r'\[moot/lemma\] (\S+)', response.text)
    if match:
        return match.group(1)

    print("The [moot/lemma] tag was not found.")
    return None

In [12]:
def normalize_position(raw_text, lemma_cache):
    """Reduce a position mention to its lemma for comparison.

    A trailing ``'stelle'`` is stripped first, then the remainder is passed to
    ``lemmatize``. Successful lookups are stored in ``lemma_cache`` so that a
    repeated string does not trigger another web request.

    Args:
        raw_text: Surface text of a predicted or annotated entity.
        lemma_cache: Dict mapping already-looked-up strings to their lemma;
            updated in place.

    Returns:
        The lemma, or ``None`` if ``lemmatize`` could not provide one.
    """
    suffix = 'stelle'
    # Strip only a real suffix: a word that merely contains 'stelle'
    # (e.g. "Bestellerin") must not lose its last six characters.
    if raw_text.endswith(suffix):
        raw_text = raw_text[:-len(suffix)]

    if raw_text in lemma_cache:
        return lemma_cache[raw_text]

    lemma = lemmatize(raw_text)
    if lemma is not None:
        # Failures are not cached, so a later occurrence retries the lookup.
        lemma_cache[raw_text] = lemma
    return lemma


annotated_positions = []
predicted_positions = []
lemma_cache = {}

for entry in tqdm(testing_data):
    # Model predictions for this advertisement, normalized to lemmas.
    parsed_doc = nlp_ner(entry['text'])
    predicted_positions.append(
        [normalize_position(str(ent.text), lemma_cache) for ent in parsed_doc.ents]
    )

    # Gold annotations, sliced from the text by character offsets and
    # normalized the same way so both lists are comparable.
    annotated_positions.append(
        [
            normalize_position(entry['text'][start:end], lemma_cache)
            for start, end, _ in entry['entities']
        ]
    )

100%|████████████████████████████████████████████████████████████████████████████████| 637/637 [21:48<00:00,  2.05s/it]


In [13]:
# Visual check: run the trained pipeline on a sample advertisement and highlight the entities.
sample_text = 'Eine Naturblumenbinderin gelernt in Wien, bittet um Kondition. Emma Ziegler, Gärtnergaſſe 7. Perchtoldsdarf. N.⸗Oc. 3123—6'
doc = nlp_ner(sample_text)
displacy.render(doc, jupyter=True, style='ent')

In [14]:
# Visual check: an advertisement that mentions two positions.
sample_text = "Fesche solide 12599 Kassierkellnerin und tüchtige Köchin finden sofort Stellung. Stadtparkrestaurant, Saaz."
doc = nlp_ner(sample_text)
displacy.render(doc, jupyter=True, style='ent')

In [15]:
# Visual check: a long agency listing that enumerates many positions.
sample_text = "Avis f. deutsche stellensuchende. sofort werden plazirt nach Ungarn: 1 sekretär zu einem Grafen 1000 fl., 1 Güter⸗Inspektor 900 fl. und Tantiéme, 1 Oekonomiebeamter 800 fl. und Deputat, 2 Wirthschaftsadjunkten à 400 fl. pro anno und freie stazion, 1 Oberförster 1200 fl., 1 Magazineur 800 fl., 1 Buchhalter 1000 fl., 1 Brennereileiter 900 fl., 1 Braumeister 1000 fl., 1 Portier 700 fl., 1 ArbeitsAufseher 750 fl., 1 Fabriks⸗Aufseher in einer chemischen Fabrik 800 fl., 1 deutscher Erzieher 40 fl., 1 Reisebegleiterin 40 fl., 1 Hausrepräsentantin 35 fl. und 1 Gesellschafterin 30 fl. pro Monat und freie stazion, durch die Plazierungs⸗Agentur des B. Malík, Budapest, sommergasse 2. Anfragen werden nur gegen Einsendung von 3 stück Briefmarken beantwortet. 6540."
doc = nlp_ner(sample_text)
displacy.render(doc, jupyter=True, style='ent')

In [16]:
# Visual check: an advertisement written with long s (ſ) and other historical characters.
sample_text = 'Akkumulatorenfachmann,ſelbſtändiger Arbeiter, in Auto“, Radio. und Telephon⸗Arbeiten verſiert, ſucht ſeine Stelle zu verbeſſern. In⸗ oder Ausland. Sene und Tſchechiſch. 4 11. Angebote erbeten an Ernſt 114 Prag⸗Smichov, Nadraini 48²⁵'
doc = nlp_ner(sample_text)
displacy.render(doc, jupyter=True, style='ent')