# This is a Jupyter notebook for testing how well the [NER model](https://huggingface.co/dbmdz/flair-historic-ner-onb) for historic German performs on our Travelogues corpus.
## Please note the comments that are given in each cell.


First, please install flair and the other packages that are required to use this model.

In [92]:
import glob
import json
import os
import pprint
import re
from typing import Dict, List, Optional

import spacy
from flair.data import Sentence
from flair.models import SequenceTagger
from spacy.tokens import Doc

from correct_ocr import single_characters, delete_specials, correct_s

In [76]:
# Relative path of the directory whose *.json metadata files are read by read_jsons.
METADATA_PATH: str = '../data/metadata/'
# Metadata labels of interest; NOTE(review): not referenced by the cells visible here.
VALUES: List[str] = ['Title', 'Contributor']

The following cell is only needed if problems with spacy.load occur.

In [None]:
# Fetch the "de_core_news_md" model through spaCy's CLI helper so that
# spacy.load("de_core_news_md") can find it afterwards.
import spacy.cli
spacy.cli.download("de_core_news_md")

This part will try out the NER on sentences from the Travelogues texts.

In [None]:
# Load the German spaCy pipeline; below it is only used to split the text into sentences
nlp = spacy.load("de_core_news_md")

# Read in files – can be noisy OCR.
# The context manager closes the file handle again, and the explicit encoding keeps
# decoding independent of the platform default (assumes the corpus is stored as UTF-8).
# Only the characters 100..1999 are kept as a short test excerpt.
with open('../data/test/bossmann_gvinea_1708.txt', 'r', encoding='utf-8') as text_file:
    file: str = text_file.read()[100:2000]

# Corrections as implemented by @Lisa Braune

# Replace a vowel followed by a combining mark (historic umlaut spelling) with the
# modern umlaut. These are plain literal replacements, so no regex is needed.
for historic, modern in (('aͤ', 'ä'), ('uͤ', 'ü'), ('oͤ', 'ö')):
    file = file.replace(historic, modern)

# Further OCR clean-up steps provided by the project-local correct_ocr module
file = correct_s(file)
file = single_characters(file)
file = delete_specials(file)

# Throw document into spacy pipeline, sentencise file
doc: Doc = nlp(file)
sents: list = [sent.text for sent in doc.sents]

# Load the historic German NER tagger once; it is reused for every sentence
tagger: SequenceTagger = SequenceTagger.load("dbmdz/flair-historic-ner-onb")

for sent in sents:
    # Transform each sentence in the list into type Sentence for function availability
    sentence = Sentence(sent)
    tagger.predict(sentence)
    print(sentence.to_tagged_string())


In [None]:
# Tag the complete cleaned excerpt as one single Sentence (no sentence splitting)
# and print its dictionary representation.
# Relies on `file` and `tagger` created by the preceding cell.
sentence = Sentence(file)
tagger.predict(sentence)
print(sentence.to_dict())


#### NER on Travelogues titles

In [129]:
def read_jsons(metadata_path: str, single_file: bool = False, doc_barcode: str = 'Z124117102', values=None,
               max_files: Optional[int] = 10) -> Dict[str, Dict[str, list]]:
    """Function that retrieves selected metadata entries from the JSON files. Can handle single and multiple file attempts.

    Every matching entry is printed as ``label: value`` and additionally collected in the returned dictionary.

    :param metadata_path: Directory of the metadata files (with or without a trailing path separator).
    :param single_file: Boolean, False if all files in metadata directory should be observed, True if only a single file should be parsed.
    :param doc_barcode: If a single file is parsed, the barcode must be provided; the file ``<doc_barcode>.json`` is read.
    :param values: List of labels of the entries someone wants to extract, e.g. the title or author of a file.
        Defaults to ``['Barcode', 'Title']``.
    :param max_files: Maximum number of files to read, taken in sorted path order. ``None`` reads every matching file.
    :return: Dictionary that maps each file name (without ``.json``) to a dictionary ``label -> list of values``.
        Files without any matching entry are omitted.
    """

    if values is None:
        values = ['Barcode', 'Title']

    indicator = doc_barcode if single_file else '*'
    # os.path.join inserts the separator only when metadata_path lacks one
    pattern = os.path.join(metadata_path, indicator + '.json')

    output_dict: Dict[str, Dict[str, list]] = {}

    # glob returns paths in arbitrary order; sorting makes the max_files cut reproducible
    for metadata_file in sorted(glob.glob(pattern))[:max_files]:
        with open(metadata_file, 'r', encoding='utf-8') as f:
            object_dict = json.load(f)

        doc_key = os.path.splitext(os.path.basename(metadata_file))[0]

        for metadata_dict in object_dict:
            raw_label = metadata_dict['label']
            try:
                # Label given as a list of dicts: use the '@value' of the first one
                label = raw_label[0]['@value']
            except TypeError:
                # Otherwise (e.g. a plain string) the label is used as it is
                label = raw_label

            if label in values:
                value = metadata_dict['value']
                print(f"{label}: {value}")
                # A list per label keeps repeated labels within one file
                output_dict.setdefault(doc_key, {}).setdefault(label, []).append(value)

    return output_dict

In [None]:
# Run with the defaults: all *.json files below METADATA_PATH (at most 10 of them),
# looking for the labels 'Barcode' and 'Title'.
read_jsons(metadata_path=METADATA_PATH)