# This is a Jupyter notebook for testing how well the [NER model](https://huggingface.co/dbmdz/flair-historic-ner-onb) for historic German performs on our Travelogues corpus.
## Please note the comments that are given in each cell.


First, please install flair and the other packages required to use this model.

In [1]:
import glob
import os.path
import re
import json
import ntpath

import nltk
import geocoder

from typing import List, Dict

from correct_ocr import single_characters, delete_specials, correct_s

from flair.data import Sentence
from flair.models import SequenceTagger

In [2]:
# Directory containing the JSON metadata files that read_jsons() parses.
METADATA_PATH: str = '../data/metadata/'
# Metadata labels of interest.
# NOTE(review): not referenced by any cell visible here; read_jsons() falls
# back to its own, identical default list when `values` is not passed.
VALUES: List[str] = ['Title', 'Contributor']

This part predicts named entities (NE) in sentences from the Travelogues texts.

In [None]:
# Load NER model into flair tagger.
# "dbmdz/flair-historic-ner-onb" is the NER model for historic German linked
# in the notebook header; the resulting tagger is reused by the cells below.
tagger: SequenceTagger = SequenceTagger.load("dbmdz/flair-historic-ner-onb")

In [None]:
# Read in files – can be noisy OCR.
# For each corpus text that has no output file yet: split it into sentences,
# tag named entities, geocode every LOC entity via GeoNames and write the
# result as a GeoJSON FeatureCollection to ../data/output/text_ner/.
nr_of_files: int = 5  # maximum number of texts to process in one run
i = 0  # number of texts processed so far in this run

for files in glob.glob('../data/18th_century_first_quarter_corr/*.txt'):
    # Derive "<name>.json" from "<name>.txt". splitext only touches the real
    # extension, whereas an unescaped regex '.txt' matches any character
    # followed by "txt" anywhere in the path.
    out_path: str = ('../data/output/text_ner/'
                     + os.path.splitext(os.path.basename(files))[0] + '.json')

    if i == nr_of_files:
        print(f"Reached limit of {nr_of_files} files.")
        break
    if os.path.exists(out_path):
        # Output already exists from an earlier run; does not count towards
        # the limit.
        continue

    print(f"Working on: {files}. It is the {i + 1}th file.")
    # Context manager so the file handle is closed deterministically.
    with open(files, 'r') as text_file:
        file: str = text_file.read()

    # Corrections as implemented by @Lisa Braune
    # NOTE(review): no correction step is applied here; the helpers imported
    # from correct_ocr are not called in this cell.

    # Sentence tokeniser with nltk
    sents = nltk.sent_tokenize(file, language='german')

    info: dict = {"type": "FeatureCollection", "features": []}
    print(f"Now predicting Named Entities in {files}.")
    for idx, sent in enumerate(sents):
        sent = Sentence(sent)
        tagger.predict(sent)
        print(sent.to_tagged_string())
        for entity in sent.get_spans('ner'):
            if entity.get_label("ner").value != 'LOC':
                continue

            # Two-step GeoNames lookup: search for the place name, then fetch
            # the details of the best match by its GeoNames id.
            # NOTE(review): the GeoNames username is hard-coded; consider
            # reading it from configuration or an environment variable.
            coordinates: list = []
            g = geocoder.geonames(entity.text, key='sarahondraszek', featureClass='A')
            g_id = g.geonames_id
            if g_id:
                g = geocoder.geonames(g_id, key='sarahondraszek', method='details')
                if g.lng is not None and g.lat is not None:
                    # float(), not int(): truncating to whole degrees would
                    # move the point by up to ~100 km.
                    coordinates = [float(g.lng), float(g.lat)]
            # When the lookup fails the feature is still recorded, with an
            # empty coordinate list, so the recognised entity is not lost.

            info["features"].append({
                "type": "Feature",
                "properties": {
                    "source_label": entity.text,
                    "sentence_idx": idx,
                    "start_position": entity.start_position,
                    "end_position": entity.end_position,
                },
                # GeoJSON positions are ordered [longitude, latitude].
                "geometry": {"type": "Point", "coordinates": coordinates},
            })
            #info[barcode]["score"] = entity.get_label("ner").score

    with open(out_path, 'w') as f:
        json.dump(info, f, indent=4)
    i += 1


#### NER on Travelogues titles

In [22]:
def read_jsons(metadata_path: str, single_file: bool = False, doc_barcode: str = 'Z124117102', values=None) -> Dict:
    """Collect selected metadata values from the JSON metadata files.

    Each metadata file is expected to contain a list of entries with a
    ``label`` and a ``value``. A label may either be a list whose first
    element is a dict with an ``@value`` key, or a plain string; both forms
    are supported.

    :param metadata_path: Directory containing the metadata files (with or without a trailing separator).
    :param single_file: False to parse every ``*.json`` file in the directory, True to parse only one file.
    :param doc_barcode: Barcode (file name without ``.json``) of the file to parse when ``single_file`` is True.
    :param values: List of labels whose values should be extracted, e.g. the title or author of a file. Defaults to ``['Title', 'Contributor']``.
    :return: Dict mapping each barcode to a dict ``{label: value}`` of the requested labels found in that file; files without any requested label map to an empty dict.
    """

    if values is None:
        values = ['Title', 'Contributor']

    indicator = doc_barcode if single_file else '*'

    output_dict = {}

    for metadata_file in glob.glob(os.path.join(metadata_path, indicator + '.json')):
        # Barcode = file name without its extension (splitext only strips the
        # trailing ".json", never an occurrence inside the name).
        barcode = ntpath.splitext(ntpath.basename(metadata_file))[0]
        found = output_dict[barcode] = {}

        with open(metadata_file, 'r', encoding='utf-8') as f:
            object_dict = json.load(f)

        for metadata_dict in object_dict:
            try:
                # Structured label: [{'@value': 'Title', ...}, ...]
                label = metadata_dict['label'][0]['@value']
            except TypeError:
                # Plain-string label: indexing a character with '@value'
                # raises TypeError, so use the label itself.
                label = metadata_dict['label']

            if label in values:
                # Add to (rather than replace) what was already collected
                # for this file.
                found[label] = metadata_dict['value']

    return output_dict


In [23]:
# Collect the default metadata labels ('Title' and 'Contributor') from every
# JSON file under METADATA_PATH, keyed by file name without ".json".
travelogues_titles: dict = read_jsons(metadata_path=METADATA_PATH)

In [24]:
def ner_tagged_jsons(titles: dict, tag: str, ner_model: SequenceTagger) -> None:
    """Run NER on one metadata field per record and write the locations as GeoJSON.

    For every record the text stored under ``tag`` is tagged with
    ``ner_model``. Each entity labelled ``LOC`` is looked up on GeoNames and
    stored as a GeoJSON ``Feature``. One ``FeatureCollection`` per record is
    written to ``../data/titles_ner_tagged_jsons/<barcode>.json``.

    :param titles: Dict mapping a barcode to a dict of metadata values, as returned by ``read_jsons``.
    :param tag: Key of the metadata value to tag, e.g. ``'Title'``.
    :param ner_model: Loaded flair sequence tagger used for prediction.
    :return: None; results are written to disk.
    """

    for barcode, items in titles.items():
        if tag not in items:
            # Records without the requested field cannot be tagged.
            print(f"Skipping {barcode}: no '{tag}' metadata found.")
            continue

        print(f"Predicting Named Entities in {barcode}.")
        info: dict = {"type": "FeatureCollection", "features": []}

        # The tagger works on flair Sentence objects, not on raw text.
        # NOTE(review): assumes the metadata value is a plain string - confirm
        # against the metadata files.
        title_sentence = Sentence(items[tag])
        ner_model.predict(title_sentence)

        for entity in title_sentence.get_spans('ner'):
            if entity.get_label("ner").value != 'LOC':
                continue

            # Two-step GeoNames lookup: search for the place name, then fetch
            # the details of the best match by its GeoNames id.
            # NOTE(review): the GeoNames username is hard-coded; consider
            # reading it from configuration or an environment variable.
            coordinates: list = []
            g = geocoder.geonames(entity.text, key='sarahondraszek', featureClass='A')
            g_id = g.geonames_id
            if g_id:
                g = geocoder.geonames(g_id, key='sarahondraszek', method='details')
                if g.lng is not None and g.lat is not None:
                    coordinates = [float(g.lng), float(g.lat)]
            # When the lookup fails the feature is still recorded, with an
            # empty coordinate list, so the recognised entity is not lost.

            info["features"].append({
                "type": "Feature",
                "properties": {
                    "source_label": entity.text,
                    "start_position": entity.start_position,
                    "end_position": entity.end_position,
                },
                # GeoJSON positions are ordered [longitude, latitude].
                "geometry": {"type": "Point", "coordinates": coordinates},
            })
            #info[barcode]["score"] = entity.get_label("ner").score

        with open('../data/titles_ner_tagged_jsons/' + barcode + '.json', 'w') as f:
            json.dump(info, f, indent=4)

In [None]:
# Run NER on the 'Title' field of each collected record with the tagger loaded
# above; writes one JSON file per barcode to ../data/titles_ner_tagged_jsons/.
ner_tagged_jsons(titles=travelogues_titles, tag='Title', ner_model=tagger)