# Name Extraction
* extract names from named-entity recognition (NER) results
* extract names from name registries

In [None]:
import spacy

In [None]:
# Load the spaCy pipeline "de_core_news_lg"; it is called on the sample text
# below and its `.ents` are filtered for person entities.
spacy_tools = spacy.load('de_core_news_lg')

In [None]:
def read(filename, one_line=False):
    with open(filename, 'r') as txt_in:
        if one_line:
            text = txt_in.read()
        else:
            text = [line.strip() for line in txt_in]
    return text

In [None]:
# Directory prefix of the OCR sample files; file names are appended directly,
# so the trailing "/" is required.
samples_dir = "data/OCR results/FR 14 Enterprise OCR results/"

In [None]:
# Read the whole sample as one string (one_line=True) so that it can be passed
# to the spaCy pipeline as a single document.
sample_1_path = f"{samples_dir}1936_10_full.txt"
sample_1 = read(sample_1_path, one_line=True)

In [None]:
# Run the spaCy pipeline on the sample; the entities are read from `.ents` below.
annotations_1 = spacy_tools(sample_1)

In [None]:
# Collect every entity labelled as a person, as (text, label) pairs.
PER = "PER"
persons = [
    (ent.text, ent.label_)
    for ent in annotations_1.ents
    if ent.label_ == PER
]

In [None]:
# Notes for cleaning the named entities (not all of them are implemented yet):
# - tokens to look out for: Manager, Avenue, Bank of, Str
#   (presumably markers of hits that are not persons -- TODO confirm)
# - entries written in all upper case
# - delete "-" and ",", but not always
# Characters listed as not belonging to a name.
not_in_name = ["\n", ","]
# Placeholder, still empty.
role_nouns = []

# Second tokens that mark a two-word entity as a street rather than a person.
street_indicator = ["Avenue", "Str.", "Av.", "Ave."]
# First tokens that are stored as a title instead of a first name.
titles = ["Dr.", "Hr.", "Fr.", "Frl."]

In [None]:
# Keep only the entity text of each (text, label) pair.
names_only = [text for text, _label in persons]

In [None]:
from collections import Counter 

In [None]:
# Count how often each entity text occurs.
cnt = Counter(names_only)

In [None]:
# Display all entity texts with their counts, most frequent first.
cnt.most_common()

In [None]:
# Drop entries whose text ends with the street abbreviation "Str.".
names_only_no_streets = [
    name for name in names_only if not name.endswith("Str.")
]

In [None]:
# Display how many entries remain after removing the "Str." hits.
len(names_only_no_streets)

In [None]:
# simple case "firstname lastname"
# The four lists are parallel columns: index i of every list describes the same
# person, so each handled entry must append exactly once to every list.
names = {"title":[], "first name": [], "last name": [], "original": []}

clean_end = [",", "\n"]

for entry in names_only:
    parts = entry.split(" ")
    if len(parts) == 1:
        # A single token is stored as a bare last name.
        title, first_name, last_name = "-", "-", entry
    elif len(parts) == 2:
        if parts[1] in street_indicator:
            # "<word> Avenue" and similar are streets, not persons.
            continue
        last_name = parts[1]
        if parts[0] in titles:
            title, first_name = parts[0], "-"
        else:
            title, first_name = "-", parts[0]
    else:
        # Entries with three or more tokens (e.g. "Last, First ...") are not
        # parsed yet. They are skipped entirely; appending placeholders to only
        # some of the lists would shift the columns against each other.
        continue

    names["original"].append(entry)
    names["title"].append(title)
    names["first name"].append(first_name)
    names["last name"].append(last_name)

In [None]:
# Display the number of entries collected in the "title" column.
len(names["title"])

## Namensregister


In [None]:
import pandas as pd

In [None]:
import re

In [None]:
# Raw strings keep "\d" and "\." as regex escapes; in a normal string literal
# they are invalid escape sequences, which newer Python versions warn about.
# Alternatives: a digit followed by "o:", a single digit, ".", " " or ";".
o_pattern = re.compile(r"\do:|\d|\.| |;")
# Alternatives: a digit followed by "0:", a single digit, "." or " ".
o_replace_pattern = re.compile(r"\d0:|\d|\.| ")

In [None]:
# A lowercase "o" directly preceded by a digit, ".", " " or ";".
_O_AFTER_NUMERIC_CONTEXT = re.compile(r"(?<=[\d. ;])o")


def replace_o(register):
    """Replace each "o" that follows a digit, ".", " " or ";" with "0".

    The rule is deliberately crude: it also rewrites an "o" that starts a
    word after a space. Every qualifying "o" in a line is replaced, and an
    "o" at the very start of a line is never touched because nothing
    precedes it.

    Args:
        register: list of text lines; it is modified in place.

    Returns:
        The same list object, with the affected lines rewritten.
    """
    for j, line in enumerate(register):
        if "o" in line:
            # The lookbehind is evaluated against the unmodified line, so one
            # replacement cannot trigger another one next to it.
            register[j] = _O_AFTER_NUMERIC_CONTEXT.sub("0", line)
    return register

In [None]:
# Raw strings keep "\d" and "\." as regex escapes instead of invalid string
# escape sequences.
# "<digits>:<digits>.<digits>.<digits>", i.e. a number, a colon and a date.
pattern_date_issue = re.compile(r"\d+:\d+\.\d+\.\d+")
# "<digits>.<digits>.<digits>", i.e. a bare date.
patter_date = re.compile(r"\d+\.\d+\.\d+")

In [None]:
# Titles that may be glued to the front of a first name.
TITLES = ["Dr.", "Prof.", "Rabb."]
# Tokens that are stored under the "nachruf/tode" key.
DATE_OF_DEATH = ["Nachruf", "Todesanzeige"]


def extract_data(name_lines):
    """Turn tokenised register lines into one flat dict per person.

    Args:
        name_lines: list of token lists (a register line split on " ").
            The first token is expected to look like "Last,First".

    Returns:
        A list of dicts. Each dict has "first name", "last name" and
        "other", and only when found "title", "Geburt/Tod",
        "nachruf/tode", "jub" and "issue_date". Lines whose last name
        starts with a non-letter are skipped; an empty last name is kept.
    """
    data = []

    for line in name_lines:
        result = {}
        # Split into a separate variable: `line` must stay intact because the
        # tokens after the name are parsed further down.
        name_parts = line[0].split(",")
        if len(name_parts) == 2:
            last, first = name_parts
        else:
            last = name_parts[0]
            first = ".".join(name_parts[1:])
        if last and not last[0].isalpha():
            continue

        for title in TITLES:
            if first.startswith(title):
                # Cut off the prefix only; split(title)[1] would drop the
                # name when the title text occurs a second time.
                first = first[len(title):]
                result["title"] = title
                break
        if "(" in first and ")" in first:
            split_first = first.split("(")
            first = split_first[0]
            if not all(char.isalpha() for char in split_first[1]):
                date = split_first[1].split(")")[0]
                # date[:1] is "" for empty parentheses, so "()" is ignored
                # instead of raising IndexError.
                if date[:1].isdigit():
                    result["Geburt/Tod"] = date
        result["first name"] = first
        result["last name"] = last

        other = []
        for element in line[1:]:
            if element in DATE_OF_DEATH:
                result["nachruf/tode"] = element
                continue

            if pattern_date_issue.match(element):
                # "<jub>:<date>" -- the part before the colon goes to "jub".
                issue_parts = element.split(":")
                result["jub"] = issue_parts[0]
                dated = issue_parts[1]
            elif patter_date.match(element):
                dated = element
            else:
                other.append(element)
                continue

            # Anything after a "(" glued to the date is kept as free text.
            if "(" in dated:
                pieces = dated.split("(")
                other.extend(pieces[1:])
                result["issue_date"] = pieces[0]
            else:
                result["issue_date"] = dated

        result["other"] = "  ".join(other)
        data.append(result)

    return data

In [None]:
# Every column a finished record must contain.
KEYS = [
    "last name",
    "first name",
    "title",
    "nachruf/tode",
    "jub",
    "issue_date",
    "other",
    "Geburt/Tod",
]


def fill_blanks(data):
    """Give every record in *data* a "-" for each key of KEYS it lacks.

    The records are modified in place; the same list is returned.
    """
    for record in data:
        for column in KEYS:
            record.setdefault(column, "-")
    return data

In [None]:
# Leading part of every generated person ID; the part of the input file name
# before the first "_" and ".PER.<n>" are appended to it.
ID_NAME = "MBNR."

In [None]:
def get_fp_id(fp):
    """Return the part of the file name before the first "_".

    Args:
        fp: a path object with a ``name`` attribute (e.g. ``pathlib.Path``).

    Returns:
        The leading name component, e.g. "12" for "12_MB-Register.pdf.txt".
        If the name contains no "_", the whole file name is returned.
    """
    # The original computed the split but never returned it.
    return fp.name.split("_")[0]

In [None]:
def convert_to_table(txt_fp, txt_out):
    """Convert one OCR'd register text file into a ";"-separated CSV file.

    Lines whose first space-separated token contains a comma are parsed
    with extract_data, padded with fill_blanks and written to *txt_out*
    together with an "ID" column of the form
    ``ID_NAME + <file name part before "_"> + ".PER." + <row number>``.
    """
    register = replace_o(read(txt_fp))

    name_lines = []
    for raw_line in register:
        tokens = raw_line.split(" ")
        if "," in tokens[0]:
            name_lines.append(tokens)

    df = pd.DataFrame(fill_blanks(extract_data(name_lines)))

    id_prefix = ID_NAME + txt_fp.name.split("_")[0] + '.PER.'
    df["ID"] = [id_prefix + str(row) for row in range(len(df))]
    df.to_csv(txt_out, index=False, sep=";")

In [None]:
from pathlib import Path

In [None]:
input_dir = Path("namensregister/OCR/")
output_dir = Path("namensregister/csv/")

# Write one CSV per register file, named after the input file's stem.
for f in input_dir.iterdir():
    # Sub-directories cannot be opened as text files; skip them, as the JSON
    # conversion loops in this notebook do.
    if not f.is_file():
        continue
    f_new = f.stem + ".csv"
    fp_new = output_dir / f_new

    convert_to_table(f, fp_new)

### Name register full structuring

In [None]:
from collections import OrderedDict
import json

In [None]:
def get_name(line):
    """Split the name off the front of a tokenised register line.

    Args:
        line: list of tokens (a register line split on " ").

    Returns:
        ``(first, last, remaining)`` when the first token contains a comma
        and the last name starts with a letter; ``remaining`` holds the
        tokens not consumed by the name. Otherwise ``(False, False, False)``.
    """
    head = line[0]
    if "," not in head:
        return False, False, False

    name_parts = head.split(",")
    if len(name_parts) == 2:
        last, first = name_parts
    else:
        last = name_parts[0]
        # fix title: additional commas inside the name token are replaced
        # by "." (presumably OCR misreads of e.g. "Dr." -- TODO confirm)
        first = ".".join(name_parts[1:])

    if len(last) == 0 or not last[0].isalpha():
        return False, False, False

    remaining = line[1:]
    # "Last, First ..." leaves the first name in the following token(s).
    # The length checks keep lines that end right after the last name, or
    # right after a title, from raising IndexError.
    if len(first) == 0 and len(line) > 1:
        if line[1] in TITLES:
            if len(line) > 2 and line[2].isalpha():
                # Joined without a space so that the title can later be
                # detected with startswith().
                first = "".join(line[1:3])
                remaining = line[3:]
        elif line[1][:2].isalpha():
            first = line[1]
            remaining = line[2:]
    return first, last, remaining

In [None]:
# {id: {name: {first:, last:, title:}, info: []}}

def extract_name_info(name_lines, id_key):
    """Group tokenised register lines into one record per person.

    A line for which get_name finds a name starts a new record. Any other
    line is appended, joined with spaces, to the "info" list of the most
    recent record; such lines are dropped while no record exists yet.

    Args:
        name_lines: list of token lists.
        id_key: ID prefix; the index of the name line within *name_lines*
            is appended to it, so IDs have gaps where continuation lines
            occur.

    Returns:
        An OrderedDict mapping person ID to
        ``{"name": {"title"?, "first", "last"}, "info": [str, ...]}``.
    """
    name_info = OrderedDict()
    # Most recently started record. Tracking it directly avoids rebuilding
    # the full key list for every continuation line.
    current = None

    for i, line in enumerate(name_lines):
        first, last, remaining = get_name(line)
        if not last:
            if current is not None:
                current["info"].append(" ".join(line))
            continue

        current = {"name": {}, "info": []}
        name_info[id_key + str(i)] = current

        for title in TITLES:
            if first.startswith(title):
                # Cut off the prefix only; split(title)[1] would drop the
                # name when the title text occurs a second time.
                first = first[len(title):]
                current["name"]["title"] = title
                break

        if "(" in first:
            # Everything from the first "(" on is extra information.
            split_first = first.split("(")
            first = split_first[0]
            current["info"].append("(" + "(".join(split_first[1:]))

        current["name"]["first"] = first
        current["name"]["last"] = last
        current["info"].append(" ".join(remaining))
    return name_info

In [None]:
def convert_to_json(txt_fp):
    """Parse one register text file into the nested per-person structure.

    Lines that consist of a single token, or whose first token is empty,
    are discarded. The rest is handed to extract_name_info with the ID
    prefix ``ID_NAME + <file name part before "_"> + ".PER."``.
    """
    lines = replace_o(read(txt_fp))

    usable = []
    for raw_line in lines:
        tokens = raw_line.split(" ")
        if len(tokens) == 1 or tokens[0] == "":
            continue
        usable.append(tokens)

    prefix = ID_NAME + txt_fp.name.split("_")[0] + '.PER.'
    return extract_name_info(usable, prefix)

In [None]:
def convert_to_json_print(txt_fp, out_fp):
    """Parse one register text file and also write the result to *out_fp*.

    Args:
        txt_fp: path object of the register text file.
        out_fp: path of the JSON file to write (indent=4).

    Returns:
        The parsed data, exactly as convert_to_json returns it.
    """
    # Parsing is delegated so that both entry points share one implementation.
    data = convert_to_json(txt_fp)

    with open(out_fp, "w") as json_out:
        json.dump(data, json_out, indent=4)
    return data

In [None]:
# save results in one dictionary
input_dir = Path("namensregister/OCR/")
out_fp = Path("namensregister/json/MB_namenregister.json")

# Per the note further down in this notebook, the full scan of register 12 is
# replaced by its "register-only" variant. Both file names start with "12_",
# so both produce IDs "MBNR.12.PER.<n>"; merging them would let entries
# overwrite each other in whatever order iterdir() returns the files.
SUPERSEDED_FILES = {"12_MB-RegisterJuni1985-Juli1990.pdf.txt"}

all_registers = {}
# sorted() makes the order of the merged dictionary reproducible.
for f in sorted(input_dir.iterdir()):
    if f.is_file() and f.name not in SUPERSEDED_FILES:
        result = convert_to_json(f)
        all_registers.update(result)

In [None]:
# Write the merged register dictionary to out_fp as indented JSON.
with open(out_fp, "w") as json_out:
    json.dump(all_registers, json_out, indent=4)

In [None]:
input_dir = Path("namensregister/OCR/")
output_dir = Path("namensregister/json/")

# need to exclude : 12_MB-RegisterJuni1985-Juli1990.pdf.txt -
# use 12_MB-RegisterJuni1985-Juli1990-register-only.pdf.txt instead
EXCLUDED_FILES = {"12_MB-RegisterJuni1985-Juli1990.pdf.txt"}

# save results by MB
# A separate name is used here on purpose: `all_registers` is the flat
# {person id: record} dictionary that the author-linking cells below iterate
# over, and it must not be replaced by this per-file mapping.
registers_by_mb = {}
for f in input_dir.iterdir():
    if f.is_file() and f.name not in EXCLUDED_FILES:
        f_new = f.stem + ".json"
        fp_new = output_dir / f_new
        result = convert_to_json_print(f, fp_new)
        registers_by_mb[f.stem] = result

### Link to resolved mentions

In [None]:
# Regex for a parenthesised group that contains only ASCII letters,
# whitespace, "," and "." (at least one character).
parantheses = r"\([A-Za-z\s,.]+\)"

In [None]:
contributor_fp = "namensregister/author_information/mb-identified-contributors.txt"
def load_contributors(fp):
    """Load the contributor list and index it by abbreviation.

    Each line is expected as ``Last, First:<GND id>:<abbr>[,<abbr>...]``.
    Blank lines and lines without a ":" are skipped.

    Args:
        fp: path of the UTF-8 encoded contributor file.

    Returns:
        A dict mapping every abbreviation to
        ``{"last", "first", "GND-URI"}``; abbreviations of the same
        contributor share one info dict.
    """
    contributors = {}
    with open(fp, 'r', encoding="utf-8") as txt_in:
        for raw_line in txt_in:
            fields = raw_line.strip().split(":")
            if len(fields) < 2:
                # Blank or malformed line: there is no GND id to attach.
                continue
            # partition() tolerates names with no or several ", ".
            last, _, first = fields[0].partition(", ")
            gid = "https://d-nb.info/gnd/" + fields[1]
            info = {"last": last, "first":first, "GND-URI":gid}
            for abbr in fields[-1].split(","):
                abbr = abbr.strip()
                # An empty key is a substring of every string and would
                # link this contributor to every entry.
                if abbr:
                    contributors[abbr] = info
    return contributors

In [None]:
# Abbreviation -> contributor info, loaded from the contributor file.
contributors = load_contributors(contributor_fp)

In [None]:
# Display the number of entries in all_registers.
len(all_registers)

In [None]:
def add_authors_info(all_registers, contributors):
    """Attach contributor information to the register entries citing them.

    Every "info" string of a person is searched for parenthesised groups
    (regex ``parantheses``). Each contributor abbreviation that occurs as a
    substring of such a group is stored under the person's "authors" key.

    Args:
        all_registers: dict mapping person ID to a record with an "info"
            list of strings; the records are modified in place.
        contributors: dict mapping abbreviation to contributor info.

    Returns:
        The same *all_registers* dict. Also prints how many persons had at
        least one author match.
    """
    count = 0
    for per_info in all_registers.values():
        matched = False
        for entry in per_info["info"]:
            for p_entry in re.findall(parantheses, entry):
                for con in contributors:
                    # An empty abbreviation is a substring of everything.
                    if con and con in p_entry:
                        per_info.setdefault("authors", {})[con] = contributors[con]
                        matched = True
        # Count each person once, so the number fits the message below.
        if matched:
            count += 1
    print(f"Added authors to {count} persons")
    return all_registers

In [None]:
# Link contributors to the register entries; the records are updated in place.
all_registers = add_authors_info(all_registers, contributors)

In [None]:
# Display a single record by its person ID as a spot check.
all_registers["MBNR.1.PER.185"]

In [None]:
# Write the register dictionary, now including "authors", as indented JSON.
with open("namensregister/json/MB_namenregister-authors.json", 'w') as j_out:
     json.dump(all_registers, j_out, indent=4)