# This is a Jupyter notebook for testing how well the [NER model](https://huggingface.co/dbmdz/flair-historic-ner-onb) for historic German performs on our Travelogues corpus.
## Please note the comments that are given in each cell.


At first, please try downloading the flair and other packages that are needed for using this model.

In [59]:
import glob
import os
import pprint
import re
import json
import ntpath

from flair.data import Sentence
from flair.models import SequenceTagger
import spacy
from spacy.tokens import Doc

from typing import List, Dict

from correct_ocr import single_characters, delete_specials, correct_s

In [60]:
METADATA_PATH: str = '../data/metadata/'
VALUES: List[str] = ['Title', 'Contributor']

The following cell is only needed if problems with spacy.load occur.

In [None]:
import spacy.cli
spacy.cli.download("de_core_news_md")

This part will try out the NER on sentences from the Travelogues texts.

In [None]:
# Load NER model into flair tagger.
tagger: SequenceTagger = SequenceTagger.load("dbmdz/flair-historic-ner-onb")

In [None]:
# Downloading language model for the spacy pipeline
nlp = spacy.load("de_core_news_md")

# Read in files – can be noisy OCR
file: str = open('../data/test/bossmann_gvinea_1708.txt', 'r').read()[100:2000]

# Corrections as implemented by @Lisa Braune

file = re.sub('aͤ', 'ä', file)
file = re.sub('uͤ', 'ü', file)
file = re.sub('oͤ', 'ö', file)

file = correct_s(file)
file = single_characters(file)
file = delete_specials(file)



# Throw document into spacy pipeline, sentencise file
doc: Doc = nlp(file)
sents: list = [sent.text for sent in doc.sents]

for sent in sents:
    # Transform each sentence in the list into type Sentence for function availability
    sentence = Sentence(sent)
    tagger.predict(sentence)
    print(sentence.to_tagged_string())


In [None]:
sentence = Sentence(file)
tagger.predict(sentence)
print(sentence.to_dict())


#### NER on Travelogues titles

In [82]:
def read_jsons(metadata_path: str, single_file: bool = False, doc_barcode: str = 'Z124117102', values=None) -> Dict:
    """Function that aims to retrieve certain metadata information from the JSON files. Can handle single and multiple file attempts.

    :param metadata_path: Path for metadata files.
    :param single_file: Boolean, False if all files in metadata directory should be observed, True if only a single file should be parsed.
    :param doc_barcode: If a single file is parsed, the barcode must be provided.
    :param values: List of tags for the information value someone wants to extract, e.g. the title or author of a file.
    :return: String of the metadata information.
    """

    output_dict = {}

    if values is None:
        values = ['Title', 'Contributor']

    if single_file:
        indicator = doc_barcode
    else:
        indicator = '*'

    for metadata_file in glob.glob(metadata_path + indicator + '.json')[:20]:
        output_dict[re.sub(r'\.json', '', ntpath.basename(metadata_file))] = {}
        with open(metadata_file, 'r') as f:
            object_dict = json.load(f)

            for metadata_dict in object_dict:
                try:
                    if metadata_dict['label'][0]['@value'] in values:
                        output_dict[re.sub(r'\.json', '', ntpath.basename(metadata_file))][metadata_dict['label'][0]['@value']] = metadata_dict['value']
                        # print(f"{metadata_dict['label'][0]['@value']}: {metadata_dict['value']}")
                except TypeError:
                    if metadata_dict['label'] in values:
                        output_dict[re.sub(r'\.json', '', ntpath.basename(metadata_file))] = {metadata_dict['label'][0]['@value']: metadata_dict['value']}
                        # print(f"{metadata_dict['label']}: {metadata_dict['value']}")

    return output_dict


In [83]:
travelogues_titles: dict = read_jsons(metadata_path=METADATA_PATH)

In [84]:
def ner_metadata(titles: dict, tag: str, ner_model: tagger) -> None:
    """

    :param titles:
    :param tag:
    :param ner_model:
    :return:
    """

    for barcode, items in titles.items():
        sentence = Sentence(items[tag])
        tagger.predict(sentence)
        print(barcode, sentence.get_labels())

In [85]:
ner_metadata(travelogues_titles, tag='Title', ner_model=tagger)

Z203131903 ['Span[0:1]: "Joseph"'/'PER' (0.7087), 'Span[7:8]: "Paris"'/'LOC' (0.9993)]
Z16446480X ['Span[4:5]: "Nordamerika"'/'LOC' (0.9999), 'Span[5:6]: ":"'/'LOC' (0.9738), 'Span[15:16]: "Kanada"'/'LOC' (0.9998), 'Span[17:18]: "Neu-England"'/'LOC' (1.0), 'Span[19:20]: "Virginien"'/'LOC' (0.9999), 'Span[38:39]: "Bourgoyne"'/'PER' (0.9997)]
Z98088202 []
Z69858209 ['Span[3:6]: "Philipp Fermins >>"'/'PER' (0.7115), 'Span[9:10]: "Surinam"'/'LOC' (0.9901)]
Z186417204 ['Span[8:9]: "Siebenbürgen"'/'LOC' (1.0), 'Span[11:12]: "Temeswarer"'/'LOC' (0.8545), 'Span[12:13]: "Banat"'/'LOC' (0.755), 'Span[14:15]: "Ungarn"'/'LOC' (0.9999), 'Span[16:17]: "Oesterreich"'/'LOC' (0.9999), 'Span[18:19]: "Bayern"'/'LOC' (0.9999), 'Span[20:21]: "Schwaben"'/'LOC' (0.9999), 'Span[22:23]: "Schweiz"'/'LOC' (0.9999), 'Span[24:25]: "Elsaß"'/'LOC' (0.9997), 'Span[30:32]: "Theodor Lange"'/'PER' (0.898)]
Z257372603 ['Span[0:1]: "Jenne"'/'PER' (0.786), 'Span[1:2]: "'s"'/'PER' (0.9592), 'Span[5:6]: "Spanien"'/'LOC' (1.0