# This is a Jupyter notebook for testing how well the [NER model](https://huggingface.co/dbmdz/flair-historic-ner-onb) for historic German performs on our Travelogues corpus.
## Please note the comments that are given in each cell.


First, please install flair and the other packages that are needed to use this model.

In [1]:
import glob
import os
import pprint
import re
import json
import ntpath

from flair.data import Sentence
from flair.models import SequenceTagger
import spacy
from spacy.tokens import Doc

from typing import List, Dict

from correct_ocr import single_characters, delete_specials, correct_s

In [2]:
# Relative path of the directory that holds the metadata JSON files.
# Keep the trailing slash: read_jsons concatenates this string directly with the file name.
METADATA_PATH: str = '../data/metadata/'
# Metadata labels of interest (these match the default selection inside read_jsons).
VALUES: List[str] = ['Title', 'Contributor']

The following cell is only needed if `spacy.load` fails, e.g. because the `de_core_news_md` model has not been downloaded yet.

In [None]:
# Optional cell: fetch the "de_core_news_md" spaCy model that a later cell loads via spacy.load.
import spacy.cli
spacy.cli.download("de_core_news_md")

This part will try out the NER on sentences from the Travelogues texts.

In [3]:
# Load the "dbmdz/flair-historic-ner-onb" NER model into a flair SequenceTagger.
# The resulting `tagger` is shared by the cells below (sentence tagging and title tagging).
tagger: SequenceTagger = SequenceTagger.load("dbmdz/flair-historic-ner-onb")



2022-12-08 07:57:27,205 loading file /Users/sarahreb/.flair/models/flair-historic-ner-onb/63111d37e8f19b08b01200ec38cd2b093d72026e56bbe99a7b25b6e3f8b7da8d.d53b1d9a206921442955a318ba5bbef2af5aabb93c4713d1ed3b8fe8c28cda3f
2022-12-08 07:57:34,296 SequenceTagger predicts: Dictionary with 16 tags: <unk>, O, S-PER, S-LOC, B-PER, E-PER, S-ORG, B-LOC, E-LOC, I-PER, B-ORG, E-ORG, I-LOC, I-ORG, <START>, <STOP>


In [None]:
# Load the German spaCy pipeline (it must already be installed); it is used below
# only to split the text into sentences.
nlp = spacy.load("de_core_news_md")

# Read an excerpt (characters 100 up to 2000) of one text file – can be noisy OCR.
# The context manager closes the file handle again; the explicit encoding keeps the
# combining characters handled below intact regardless of the platform default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('../data/test/bossmann_gvinea_1708.txt', 'r', encoding='utf-8') as ocr_file:
    file: str = ocr_file.read()[100:2000]

# Corrections as implemented by @Lisa Braune

# Historic umlaut spellings (base vowel followed by a combining mark) are replaced by
# the modern umlaut letters. These are plain literal replacements, so no regex is needed.
for historic_form, modern_form in (('aͤ', 'ä'), ('uͤ', 'ü'), ('oͤ', 'ö')):
    file = file.replace(historic_form, modern_form)

# Further OCR clean-up steps imported from the local correct_ocr module.
file = correct_s(file)
file = single_characters(file)
file = delete_specials(file)

# Throw document into spacy pipeline, sentencise file
doc: Doc = nlp(file)
sents: List[str] = [sent.text for sent in doc.sents]

for sent in sents:
    # Transform each sentence in the list into type Sentence for function availability
    sentence = Sentence(sent)
    tagger.predict(sentence)
    print(sentence.to_tagged_string())


In [None]:
# Tag the complete corrected excerpt at once, i.e. as one single flair Sentence
# rather than sentence by sentence, and print its dictionary representation.
sentence = Sentence(file)
tagger.predict(sentence)
print(sentence.to_dict())


#### NER on Travelogues titles

In [32]:
def _metadata_label(metadata_dict):
    """Return the label text of one metadata entry, or None if it has no usable label.

    Two label shapes are supported: a plain string, or a non-empty list whose first
    element is a dict that stores the label text under ``'@value'``.
    """
    label = metadata_dict.get('label')
    if isinstance(label, list) and label and isinstance(label[0], dict):
        label = label[0].get('@value')
    return label if isinstance(label, str) else None


def read_jsons(metadata_path: str, single_file: bool = False, doc_barcode: str = 'Z124117102', values=None,
               max_files=10) -> Dict:
    """Function that retrieves certain metadata information from the JSON files. Can handle single and multiple file attempts.

    Each JSON file is expected to contain a list of entries that carry a ``label`` and a ``value``.

    :param metadata_path: Path prefix for the metadata files. It is concatenated directly with the file name, so it has to end with a path separator.
    :param single_file: Boolean, False if all files in metadata directory should be observed, True if only a single file should be parsed.
    :param doc_barcode: If a single file is parsed, the barcode (file name without ``.json``) must be provided.
    :param values: List of labels for the information someone wants to extract, e.g. the title or author of a file. Defaults to ``['Title', 'Contributor']``.
    :param max_files: Maximum number of matching files that are read (default 10, as before); ``None`` reads all matching files.
    :return: Dict that maps each barcode to a dict ``{label: value}`` of the requested entries found in that file. Files without any requested entry map to an empty dict.
    """

    if values is None:
        values = ['Title', 'Contributor']

    indicator = doc_barcode if single_file else '*'

    output_dict = {}

    for metadata_file in glob.glob(metadata_path + indicator + '.json')[:max_files]:
        # The barcode is the file name without its extension; compute it once per file.
        barcode = re.sub(r'\.json', '', ntpath.basename(metadata_file))
        file_metadata = output_dict[barcode] = {}

        with open(metadata_file, 'r', encoding='utf-8') as f:
            object_dict = json.load(f)

        for metadata_dict in object_dict:
            label = _metadata_label(metadata_dict)
            if label in values:
                # Add to the per-file dict (never replace it), so that several requested
                # labels of the same file are all kept, whatever shape their label has.
                file_metadata[label] = metadata_dict['value']

    return output_dict


In [33]:
# Collect the metadata entries listed in VALUES for the files found under METADATA_PATH.
# VALUES is passed explicitly so that the configuration cell actually controls the selection.
travelogues_titles: dict = read_jsons(metadata_path=METADATA_PATH, values=VALUES)

In [45]:
def ner_tagged_jsons(titles: dict, tag: str, ner_model: "SequenceTagger",
                     output_dir: str = '../data/ner_tagged_jsons/') -> None:
    """Tag one metadata field per document and write the locations found to one JSON file per document.

    For every barcode the text stored under ``tag`` is run through ``ner_model``. Each entity
    labelled ``LOC`` becomes a ``Feature`` (source text plus start/end character position, with
    empty coordinates) inside a ``FeatureCollection`` written to ``<output_dir>/<barcode>.json``.

    :param titles: Dict mapping a barcode to a dict of metadata entries, e.g. the result of read_jsons.
    :param tag: Key of the metadata entry whose text should be tagged, e.g. 'Title'.
    :param ner_model: Loaded flair SequenceTagger used for the prediction.
    :param output_dir: Directory for the resulting JSON files; it is created if it does not exist yet.
    :return: None, the results are written to disk.
    """

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for barcode, items in titles.items():
        # Documents without the requested entry (e.g. an empty metadata dict) are skipped
        # instead of aborting the whole run with a KeyError.
        if tag not in items:
            print(f"Skipping {barcode}: no '{tag}' entry in its metadata.")
            continue

        print(f"Predicting Named Entities in {barcode}.")
        # NOTE(review): assumes the stored metadata value is a plain string — confirm.
        title_sentence = Sentence(items[tag])
        ner_model.predict(title_sentence)

        # Only location entities are exported; the coordinates stay empty here.
        features = [
            {
                "type": "Feature",
                "properties": {
                    "source_label": entity.text,
                    "start_position": entity.start_position,
                    "end_position": entity.end_position,
                },
                "geometry": {"type": "Point", "coordinates": []},
            }
            for entity in title_sentence.get_spans('ner')
            if entity.get_label("ner").value == 'LOC'
        ]
        info: dict = {"type": "FeatureCollection", "features": features}

        with open(os.path.join(output_dir, barcode + '.json'), 'w') as f:
            f.write(json.dumps(info, indent=4))

In [46]:
# Tag the 'Title' entry of every collected document with the loaded tagger;
# ner_tagged_jsons writes one JSON file per barcode.
ner_tagged_jsons(titles=travelogues_titles, tag='Title', ner_model=tagger)

Predicting Named Entities in Z203131903.
Predicting Named Entities in Z16446480X.
Predicting Named Entities in Z98088202.
Predicting Named Entities in Z69858209.
Predicting Named Entities in Z186417204.
Predicting Named Entities in Z257372603.
Predicting Named Entities in Z204099201.
Predicting Named Entities in Z16005920X.
Predicting Named Entities in Z203234200.
Predicting Named Entities in Z221813609.
