# Star2BF - Star to Bibframe

Import libraries:

In [1]:
from rdflib import Graph, Literal
from rdflib.namespace import RDF, RDFS, XSD, Namespace
from rdflib import BNode
from rdflib import URIRef
import xml.etree.ElementTree as ET
import re
import html
import modules.mappings as mappings
# import modules.open_science as open_science
import requests_cache
from datetime import timedelta

# old fuzzy compare for reconciliations: using fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# new fuzzy compare: using the faster rapidfuzz as a drop-in replacement for fuzzywuzzy:
# from rapidfuzz import fuzz
# from rapidfuzz import process

import csv

# ror lookup
ROR_API_URL = "https://api.ror.org/organizations?affiliation="  

from modules.mappings import funder_names_replacelist

# set up friendly session by adding mail in request:
CROSSREF_FRIENDLY_MAIL = "&mailto=ttr@leibniz-psychology.org"
# for getting a list of funders from api ():
CROSSREF_API_URL = "https://api.crossref.org/funders?query="

urls_expire_after = {
    # Custom cache duration per url, 0 means "don't cache"
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/09183&lang=de': 0,
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/': 0,
}
# using cache for ror requests
session = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)
# and a cache for the crossref api:
session_fundref = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)

# import csv of LUX authority institutes:
with open('institute_lux.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # save it in a list:
    lux_institutes = list(reader)
    # split string "known_names" into a list of strings on "##":
    for institute in lux_institutes:
        institute["known_names"] = institute["known_names"].split(" ## ")
# print("Und die ganze Tabelle:")
# print(dachlux_institutes)


Create an "element tree" from the records in my xml file so we can loop through them and do things with them:

In [2]:
# root = ET.parse("xml-data/records-440.xml")
# root = ET.parse("xml-data/records-322.xml")
# root = ET.parse("xml-data/records-395.xml")
# root = ET.parse("xml-data/records-214.xml")
root = ET.parse("/home/tina/Developement/psyndex-workflows/star-to-rdf/data/230424_000956/xml/records-556.xml")

# To see the source xml's structure, uncomment this function:
# def print_element(element, depth=0):
#     print("\t"*depth, element.tag, element.attrib, element.text)
#     for child in element:
#         print_element(child, depth+1)

# for child in root.getroot()[:2]:
#     print_element(child)


We first set a few namespace objects for bibframe, schema.org and for our resources (the works and instances) 
themselves.

Then, we create two graphs from the xml source file, one to generate triples for our bibframe profile output, and the other for the simplified schema.org profile. 

Finally, we bind the prefixes with their appropriate namespaces to the graphs.

In [3]:
BF = Namespace("http://id.loc.gov/ontologies/bibframe/")
BFLC = Namespace("http://id.loc.gov/ontologies/bflc/")
MADS = Namespace("http://www.loc.gov/mads/rdf/v1#")
SCHEMA = Namespace("https://schema.org/")
WORKS = Namespace("https://w3id.org/zpid/resources/works/")
INSTANCES = Namespace("https://w3id.org/zpid/resources/instances/")
PXC = Namespace("https://w3id.org/zpid/ontology/classes/")
PXP = Namespace("https://w3id.org/zpid/ontology/properties/")
LANG = Namespace ("http://id.loc.gov/vocabulary/iso639-2/")
LOCID = Namespace("http://id.loc.gov/vocabulary/identifiers/")
CONTENTTYPES = Namespace("http://id.loc.gov/vocabulary/contentTypes/")
ROLES = Namespace("https://w3id.org/zpid/vocabs/roles/")
RELATIONS = Namespace("https://w3id.org/zpid/vocabs/relations/")
GENRES = Namespace("https://w3id.org/zpid/vocabs/genres/")


# graph for bibframe profile:
records_bf = Graph()
# make the graph named:
records_bf = Graph(identifier=URIRef("https://w3id.org/zpid/bibframe/records/"))

kerndaten = Graph()
kerndaten.parse("ttl-data/kerndaten.ttl", format="turtle")    

# import graph for crossref funder registry dump:
# crossref_funders = Graph()
# crossref_funders.parse("crossref_fundref_registry.rdf", format="xml")
# we need a new graph for the schema.org profile, so it won't just reuse the old triples from the other profile
# records_schema = Graph()

# Bind the namespaces to the prefixes we want to see in the output:
records_bf.bind("bf", BF) 
records_bf.bind("bflc", BFLC) 
records_bf.bind("works", WORKS)  
# records_schema.bind("works", WORKS) 
records_bf.bind("instances", INSTANCES) 
records_bf.bind("pxc", PXC) 
records_bf.bind("pxp", PXP) 
records_bf.bind("lang", LANG) 
records_bf.bind("schema", SCHEMA) 
records_bf.bind("locid", LOCID) 
records_bf.bind("mads", MADS) 
records_bf.bind("roles", ROLES) 
records_bf.bind("relations", RELATIONS)
records_bf.bind("genres", GENRES)
records_bf.bind("contenttypes", CONTENTTYPES)


# Functions to do all the things

We need functions for the different things we will do - to avoid one long monolith of a loop.

This is where they will go. Examples: Create blank nodes for Idebtifiers, create nested contribution objects from disparate person entries in AUP, AUK, CS and COU fields, merge PAUP (psychauthor person names and ids) with the person's name in AUP...

These functions will later be called at the bottom of this notebook, in a loop over all the xml records.

## TODO: Splitting Instances from single records with MT and MT2

A record that has two media types (both a MT and a MT2 field) actually contains two instances.

Let's start with books, first: Records with BE=SS or SM. Usually, when there are two media types, MT is "Print" and MT2 is "Online Medium" or vice versa.

So we go through each record with BE=SS (or SM) and check for MT and MT2. If both are present, we create two instances, one for each media type. We will first describe this by giving them additional classes: bf:Electronic for Online Medium and bf:Print for Print.

We will also add a property to the instance that links it to its other format, via bf:otherPsysicalFormat.


In [4]:
# function to set mediaCarrier from mt and mt2:
from arrow import get


def get_mediacarrier(mediatype):
    cases = [
        ("Print", "Print"),
        ("Online Medium", "Electronic"),
        ("eBook", "Electronic"),
    ]
    for case in cases:
        if case[0] == mediatype:
            return URIRef(BF[case[1]])
    return URIRef(BF[mediatype])

def get_publication_info(instance_uri, record, mediatype):
    # get the publication info:
    pu = None
    pu = record.find("PU")
    pufield = html.unescape(pu.text.strip())
    if pu is not None and pufield != "":
        # split out the content after |e:
        pub_lisher = pufield.split("|v")
        pub_place = pufield.split("|o")
        p_isbn = pufield.split("|i")
        e_isbn = pufield.split("|e")
        # add a bf:provisionActivity to the instance:
        publication_node = BNode()
        records_bf.add((instance_uri, BF.provisionActivity, publication_node))
        # add the bf:place to the bf:provisionActivity:
        if len(pub_place) > 1:
            records_bf.add((publication_node, BFLC.simplePlace, Literal(str(pub_place[1]).strip())))
        # add the pub_lisher to the bf:provisionActivity as bflc:simpleAgent:
        if len(pub_lisher) > 1:
            records_bf.add((publication_node, BFLC.simpleAgent, Literal(str(pub_lisher[1]).strip())))
        # add the p_isbn to the instance:
        isbn_node = BNode()
        records_bf.add((instance_uri, BF.identifiedBy, isbn_node))
        records_bf.add((isbn_node, RDF.type, BF.Isbn))
        if get_mediacarrier(mediatype) == BF.Electronic:
            records_bf.add((instance_uri, BF.identifiedBy, isbn_node))
            records_bf.add((isbn_node, RDF.type, BF.Isbn))
            records_bf.add((isbn_node, RDF.value, Literal(str(e_isbn[1]).strip())))
        else:
            records_bf.add((isbn_node, RDF.value, Literal(str(p_isbn[1]).strip())))

def split_books(instance_uri, record):
    # check the BE field to see if it is "SS" or "SM":
    be = None
    be = record.find("BE")
    befield = be.text.strip()
    if be is not None and befield == "SS" or befield == "SM":
        mt=None
        mt2 = None
        mtfield = html.unescape(record.find("MT").text.strip())
        mt2field = html.unescape(record.find("MT2").text.strip())
        # we should check if there is an "e isbn" somewhere in PU subfield |e:
        
        if mt is not None and mtfield != "":
            # note the content of the MT field and use get_mediacarrier to get the corresponding bibframe instance class:
            # add the resulting bf class to the instance:
            # print("It's a book! Subclass: " + str(get_mediacarrier(mt.text)))
            records_bf.add((instance_uri, RDF.type, get_mediacarrier(mtfield)))
            
        
        if mt2 is not None and mt2field != "":
            # use get_mediacarrier to get the corresponding bibframe instance class:
            # add the resulting bf class to the instance:
            # print("It's also a subclass: " + str(get_mediacarrier(mt2.text)))
            # we add another instance for the second book:
            instance2 = BNode()
            records_bf.add((instance2, RDF.type, BF.Instance))
            records_bf.add((instance2, RDF.type, get_mediacarrier(mt2field)))
            records_bf.add((instance_uri, BF.otherPhysicalFormat, instance2))
            
       


## Semi-generic helper functions

### Getting subfields from a field

In [5]:
def get_subfield(subfield_full_string, subfield_name):
    """Given a string that contains star subfields (|name ) and the name of the subfield,
e.g. i for |i, return the content of only that subfield as a string."""
    # strip out any double spaces and replace with single space, also strip spaces around:
    subfield_full_string = re.sub(' {2,}', ' ', subfield_full_string.strip())
    subfield = subfield_full_string.split(f"|{subfield_name}")[1].strip().split("|")[0].strip()
    # print(subfield)
    return subfield 

### Getting URLs and DOIs from a field
Converting http-DOIs to pure ones, checking if what looks like url really is one.

In [6]:
def check_for_url_or_doi(string):
    """checks if the content of the string is a doi or url or something else.
       Returns the a string and a string_type (doi, url, unknown). The given string 
       is sanitized, eg. missing http protocol is added for urls; dois are stripped
       of web protocols and domain/subdomains like dx, doi.org)."""
    # use a regex: if string starts with "DOI " or "DOI:" or "DOI: " (case insensitive), remove that and strip again:
    error_pattern = re.compile(r"^(DOI:|DOI |DOI: )", re.IGNORECASE)
    string = error_pattern.sub("", string).strip()
    # replace double spaces with single space and single space with underscore, 
    # fixing a known STAR bug that replaces underscores with spaces, 
    # which is especially bad for urls.  (In other text, 
    # we can't really fix it, since usually a space was intended):
    string = re.sub(' {2,}', ' ', string)
    string = re.sub(" ", "_", string)
    doi_pattern = re.compile(r"^(https?:)?(\/\/)?(dx\.)?doi\.org\/?(.*)$")
    if doi_pattern.search(string):
        # remove the matching part:
        string = doi_pattern.search(string).group(4)
        string_type = "doi"
        # print("DOI: " + doi)
    elif string.startswith("10."):
        # if the string starts with "10." the whole thing is a DOI:
        string_type = "doi"
        # print("DOI: " + doi)
        # proceed to generate an identifier node for the doi:
    else:
        # doi = None
        # check for validity of url using a regex:
        url_pattern = re.compile(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", re.IGNORECASE)
        if url_pattern.search(string):
            # if it's a nonstandard url starting with "//", add a "http:" protocol to the start:
            if string.startswith("//"):
                string = "http:" + string
                # or if it starts with a letter (like osf.io/), add "http://" to the start:
            elif string[0].isalpha() and not string.startswith("http"):
                string = "http://" + string
            string_type = "url"
            # print("URL: " + datac_url)
        else:
            # url = None
            string_type = "unknown"
            # print("Das ist weder eine DOI noch eine URL: " + string)
    return string, string_type

### Building identifier nodes for DOIs

Should probably refactor to be more general, so we can use it for other identifiers as well. Needs a parameter for the identifier type.

In [7]:
def build_doi_identifier_node(instance, doi):
    # print(f"bf:identifiedBy > bf:Doi > rdf:value: {doi}.")
    # make bnode for the identifier:
    identifier_node = BNode()
    # give it class bf:Doi:
    records_bf.add((identifier_node, RDF.type, BF.Doi))
    # give it the doi as a literal value:
    records_bf.add((identifier_node, RDF.value, Literal(doi)))
    # attach it to the instance with bf:identifiedBy:
    records_bf.add((instance, BF.identifiedBy, identifier_node))

### Building "Links" as electronic locator nodes for an instance

In [8]:
def build_electronic_locator_node(instance, url):
    locator_node = BNode()
    # add it to the instance_node of relationship_node via bf:electronicLocator:
    # no specific class!
    # give it the url as a literal value:
    records_bf.set((locator_node, RDF.value, Literal(url, datatype=XSD.anyURI)))
    # attach it to the instance with bf:electronicLocator:
    records_bf.set((instance, BF.electronicLocator, locator_node))

### Building generic bf:Note nodes

Will probably also need this later for other kinds of notes, such as the ones in field BN.

In [9]:
def build_note_node(instance, note):
    note_node = BNode()
    records_bf.set((note_node, RDF.type, BF.Note))
    records_bf.set((note_node, RDFS.label, Literal(note)))
    records_bf.set((instance, BF.note, note_node))

## Function: Replace weird characters with unicode



In [10]:
# from modules.mappings import dd_codes

# def replace_encodings(text):
#     # text = html.escape(text)
#     for case in dd_codes:
#         text = text.replace(case[0], case[1]) 
#     return text

# moved to modules.mappings!

## Function: Guess language of a given string
Used for missing language fields or if there are discrepancies between the language field and the language of the title etc.

In [11]:
import langid
langid.set_languages(["de", "en"])

def guess_language(string_in_language):
    return (langid.classify(string_in_language)[0])

## Function: Adding DFK as an Identifier

### DFK as id for Bibframe

We want to add the DFK as a local bf:Identifier to the work (or instance?). 
We also want to say where the Identifier originates (to say it is from PSYNDEX/ZPID). 

The format for that is:
```turtle
<Work/Instance> bf:identifiedBy [
    a bf:Local, pxc:DFK; 
    rdf:value "1234456"; 
    bf:source [
        a bf:Source; bf:code "ZPID.PSYNDEX.DFK"
    ]
];
```

So, we need a blank node for the Identifier and inside, another nested bnode for the bf:Source. This is a function that will return such an identifier bnode to add to the work_uri. We are calling it way up down below in the loop:

In [12]:
#  a function to be called in a for-loop while going through all records of the source xml, 
# which returns a new triple to add to the graph that has a bnode for the dfk identifier.
# The predicate is "bf:identifiedBy" and the object is a blank node of rdf:Type "bf:Identifier" and "bf:Local":
# The actual identifier is a literal with the text from the "DFK" element of the record.
def get_bf_identifier_dfk(instance_uri, dfk):
    # make a  BNODE of the Identifier class from the BF namespace:
    identifier = BNode()
    #identifier = URIRef(instance_uri + "/identifier/dfk")
    identifier_source = BNode()
    # records_bf.add ((identifier, RDF.type, BF.Identifier))
    records_bf.add ((identifier, RDF.type, BF.Local))
    records_bf.add ((identifier, RDF.type, PXC.DFK))
    # build the source node:
    records_bf.add((identifier_source, RDF.type, BF.Source))
    records_bf.add((identifier_source, BF.code, Literal("ZPID.PSYNDEX.DFK")))

    # hang the id source node into the id node:
    records_bf.add((identifier, BF.source, identifier_source))
    records_bf.add((identifier, RDF.value, Literal(dfk)))
    return (identifier)

## Generic Function: Replace languages with their language tag

Can be used for different fields that are converted to langstrings or language uris. Use within other functions that work with the languages in different fields.

Returns an array with two values: a two-letter langstring tag at [0] and a three-letter uri code for the library of congress language vocab at [1].

In [13]:
def get_langtag_from_field(langfield):
    # when passed a string from any language field in star, returns an array with two items. 
    # Index 0: two-letter langstring tag, e.g. "de"
    # Index 1: two-letter iso langtag, e.g. "ger"
    # can be used on these fields (it contains the different spellings found in them):
    # "LA", "LA2", "TIL", "TIUL", "ABLH", "ABLN", "TIUE |s"
    match langfield:
        case "german" | "de" | "GERM" | "Deutsch" | "GERMAN" | "GERMaN" | "German" | "Fi":
            return ["de", "ger"]
        case "en" | "ENGL" | "ENGLISH" | "Englisch" | "English" | "English; English" | "english" :
            return ["en", "eng"]
        case "BULG" | "Bulgarian":
            return ["bg", "bul"]
        case "SPAN"| "Spanish":
            return ["es", "spa"]
        case "Dutch":
            return ["nl", "dut"]
        case "CZEC":
            return ["cs", "ces"]
        case "FREN" | "French":
            return ["fr", "fra"]
        case "ITAL" | "Italian":
            return ["it", "ita"]
        case "PORT" | "Portuguese":
            return ["pt", "por"]
        case "JAPN" | "Japanese":
            return ["jp", "jpn"]
        case "HUNG":
            return ["hu", "hun"]
        case "RUSS" | "Russian":
            return ["ru", "rus"]
        case "NONE" | "Silent":
            return ["zxx", "zxx"]
        case _:
            return ["und", "und"] # for "undetermined!"




## Function: Get work language from LA

Example

```turtle
@prefix lang: <http://id.loc.gov/vocabulary/iso639-2/> .
<W> bf:language lang:ger .
```

Calls the generic language code lookup function above, get_langtag_from_field, passing the LA field content, returning a uri from the library of congress language vocabulary (built from namespace + 3-letter iso code). 

TODO:
- But what if field LA is missing? (doesn't occur in test set, but not impossible)
- or if there is another language in LA2? (in my test set, 2 out of 700 records have LA2)

In [14]:
# function 
def get_work_language(record):
    work_language = get_langtag_from_field(record.find("LA").text.strip())[1]
    work_lang_uri = LANG[work_language]
    return (work_lang_uri)

## Function: Build a Relationship Node for different types of related works

Should take parameters - a dict per type (research data closed access, rd open access, ...) that has values for all the needed fields

In [15]:
from uri_template import URITemplate


def build_work_relationship_node(work_uri, relation_type):
    # check the relation_type against the relation_types dict:
    if relation_type in relation_types:
        # if it is, get the values for the relation_type:
        relation = relation_types[relation_type]["relation"]
        relatedTo_subprop = relation_types[relation_type]["relatedTo_subprop"]
        work_subclass = relation_types[relation_type]["work_subclass"]
        content_type = relation_types[relation_type]["content_type"]
        genre = relation_types[relation_type]["genre"]
        access_policy_label = relation_types[relation_type]["access_policy_label"]
        access_policy_value = relation_types[relation_type]["access_policy_value"]
    # make a bnode for this relationship:
    relationship_bnode = BNode()
    # make it class bflc:Relationship:
    records_bf.set((relationship_bnode, RDF.type, BFLC.Relationship))
    # add a bflc:Relation (with a label and value) via bflc:relation to the relationship bnode 
    # (label and value could be given as a parameter):
    # print("\tbflc:relation [a bflc:Relation ; rdfs:label 'has research data', rdf:value 'relation:hasResearchData'^^xsd:anyURI] ;")
    # relation_bnode = BNode()
    # records_bf.set((relation_bnode, RDF.type, BFLC.Relation))
    # records_bf.add((relation_bnode, RDFS.label, Literal("has research data")))
    # records_bf.add((relation_bnode, RDF.value, Literal(RELATIONS.hasResearchData)))
    records_bf.set((relationship_bnode, BFLC.relation, URIRef(RELATIONS[relation])))
    # make a bnode for the work:
    related_work_bnode = BNode()
    records_bf.add((related_work_bnode, RDF.type, BF.Work))
    records_bf.add((related_work_bnode, RDF.type, URIRef(BF[work_subclass])))
    # give work a content type:
    records_bf.add((related_work_bnode, BF.content, URIRef(CONTENTTYPES[content_type])))
    # and a genre:
    records_bf.add((related_work_bnode, BF.genre, URIRef(GENRES[genre])))
    # attach the work bnode to the relationship bnode with bf:relatedTo 
    # (or a subproperty as given as a parameter)):
    # print("\tbf:relatedTo [a bf:Work ;")
    records_bf.add((relationship_bnode, BF[relatedTo_subprop], related_work_bnode))
    # make a bnode for the instance:
    related_instance_bnode = BNode()
    records_bf.set((related_instance_bnode, RDF.type, BF.Instance))
    records_bf.add((related_instance_bnode, RDF.type, BF.Electronic))
    records_bf.add((related_work_bnode, BF.hasInstance, related_instance_bnode))
    # add accesspolicy to instance:
    if access_policy_label is not None and access_policy_value is not None:
        access_policy_node = BNode()
        records_bf.add((access_policy_node, RDF.type, BF.AccessPolicy))
        records_bf.add((access_policy_node, RDFS.label, Literal(access_policy_label, lang="en")))
        records_bf.add((access_policy_node, RDF.value, Literal(access_policy_value, datatype=XSD.anyURI)))
        records_bf.add((related_instance_bnode, BF.usageAndAccessPolicy, access_policy_node))
    # in the end, return the relationship bnode so it can be attached to the work
    # records_bf.add((work_uri, BFLC.relationship, relationship_bnode))
    return relationship_bnode, related_instance_bnode

relation_types = {
    "rd_open_access": {
        "relation": "hasResearchData",
        "relatedTo_subprop": "supplement",
        "work_subclass": "Dataset",
        "content_type": "cod",
        "genre": "researchData",
        "access_policy_label": "open access",
        "access_policy_value": "http://purl.org/coar/access_right/c_abf2"
    },
    "rd_restricted_access": {
        "relation": "hasResearchData",
        "relatedTo_subprop": "supplement",
        "work_subclass": "Dataset",
        "content_type": "cod",
        "genre": "researchData",
        "access_policy_label": "restricted access",
        "access_policy_value": "http://purl.org/coar/access_right/c_16ec"
    },
    "preregistration": {
        "relation": "hasPreregistration",
        "relatedTo_subprop": "supplement",
        "work_subclass": "Text",
        "content_type": "txt",
        "genre": "preregistration",
        "access_policy_label": None,
        "access_policy_value": None,
    },
}

## Function: Create Instance Title nodes from fields TI, TIU, TIL, TIUE...

Titles and Translated titles are attached to Instances. Translated titles also have a source, which can be DeepL, ZPID, or Original.

Example:

```turtle
<Instance> bf:title 
        [a bf:Title; 
            bf:mainTitle "Disentangling the process of epistemic change"@en;
            bf:subtitle "The role of epistemic volition"@en;
        ],
        [a pxc:TranslatedTitle;
            rdfs:label "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:mainTitle "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:adminMetadata  [ 
                a bf:AdminMetadata ;
                bflc:metadataLicensor  "DeepL";
        ]
        ].
```

- [x] add TI as bf:Title via bf:mainTitle
- [x] add subtitle from TIU
- [x] create a concatenated rdfs:label from TI and TIU
- [x] add languages for maintitle and subtitle (from TIL and TIUL)

- [x] add translated title from TIUE as pxc:TranslatedTitle with bf:mainTitle and rdfs:label 
- [x] add languages for translated title (from subfield TIU |s, or if unavailable, guess language from the subtitle string itself (contents of TIU)
- [x] create a source/origin for the translated title (from "(DeepL)" at the end)

In [16]:
#  a function to be called in a for-loop while going through all records of the source xml, 
# which returns a new triple to add to the graph that has a bnode for the dfk identifier.
# The predicate is "bf:identifiedBy" and the object is a blank node of rdf:Type "bf:Identifier" and "bf:Local":
# The actual identifier is a literal with the text from the "DFK" element of the record.
def get_bf_title(instance_uri, record):
    # make a  BNODE for the title:
    title = BNode()
    # title = URIRef(instance_uri + "/title")
    # make it bf:Title class:
    records_bf.add ((title, RDF.type, BF.Title))

    # get the content of th TI field as the main title:
    maintitle = html.unescape(mappings.replace_encodings(record.find("TI").text).strip())
    # write a full title for the rdfs:label 
    # (update later if subtitle exists to add that)
    fulltitle = maintitle
    # set fallback language for main title:
    maintitle_language = "en"
    subtitle_language = "en"
    # get language of main title - if exists!:
    if record.find("TIL") is not None:
        maintitle_language = get_langtag_from_field(record.find("TIL").text.strip())[0]
        # if maintitle_language that is returned the get_langtag_from_field is "und" 
        # (because it was a malformed language name), guess the language from the string itself!
        if maintitle_language == "und":
            maintitle_language = guess_language(maintitle)
    else: # if there is no TIL field, guess the language from the string itself!
        maintitle_language = guess_language(maintitle)

   

    # add the content of TI etc via bf:mainTitle:
    records_bf.add((title, BF.mainTitle, Literal(maintitle, lang=maintitle_language)))
    # get content of the TIU field as the subtitle, 
    # _if_ it exists and has text in it:
    if record.find("TIU") is not None and record.find("TIU").text != "":
        subtitle = html.unescape(mappings.replace_encodings(record.find("TIU").text).strip()) # sanitize encoding and remove extraneous spaces
        # concatenate a full title from main- and subtitle, 
        # separated with a : and overwrite fulltitle with that
        fulltitle = fulltitle + ": " + subtitle
        # get language of subtitle - it is in field TIUL, but sometimes that is missing...:
        #  # get language of subtitle:
        if record.find("TIUL") is not None:
            subtitle_language = get_langtag_from_field(record.find("TIUL").text.strip())[0]
            if subtitle_language == "und":
                subtitle_language = guess_language(subtitle)
        else: # if there is no TIUL field, guess the language from the string itself!
            subtitle_language = guess_language(subtitle)

        # add the content of TIU to the bf:Title via bf:subtitle:
        records_bf.add((title, BF.subtitle, Literal(subtitle, lang=subtitle_language)))

    

    # add the concatenated full title to the bf:Title via rdfs:label:
    # (we don't care if the main title's and subtitle's languages don't match - we just set the language of the main title as the full title's language)
    records_bf.add((title, RDFS.label, Literal(fulltitle, lang=maintitle_language)))

    # # hang the id source node into the id node:
    # records_bf.add((identifier, BF.source, identifier_source))
    return (title)

# function for the translated title:
def get_bf_translated_title(instance_uri, record):
    translated_title = BNode()
    #translated_title = URIRef(instance_uri + "/title/translated")
    records_bf.add ((translated_title, RDF.type, PXC.TranslatedTitle))
    fulltitle = html.unescape(mappings.replace_encodings(record.find("TIUE").text).strip())
    fulltitle_language = "de"
    # read subfield |s to get the actual language (it doesn't always exist, though). 
    # if fulltitle string ends with "|s " followed by some text (use a regex):
    match = re.search(r'^(.*)\s\|s\s(.*)', fulltitle)
    if match:
        fulltitle = match.group(1).strip()
        fulltitle_language = get_langtag_from_field(match.group(2).strip())[0]
    else:
        # get the language in TIUE, if that field exists, and invert it to get the language of the translation:
        # if record.find("TIL") is not None:
        #     original_title_language_til = get_langtag_from_field(record.find("TIL").text.strip())[0]
            
        #     # if it is German -> use inverse: "en"
        #     if original_title_language_til == "de":
        #         fulltitle_language = "en"
        #     # else -> keep "de"
        # if the language of the translated title (in |s) is missing, guess the language from the string itself!
        fulltitle_language = guess_language(fulltitle)

    # check if the title contains a "(DeepL)" and cut it into a variable for the source:
    titlesource = "ZPID" # translation source is "ZPID" by default
    # note: we might be able to add source "Original" by finding out 
    # if the source of the secondary abstract is something other than ZPID!
    match_source = re.search(r'^(.*)\((DeepL)\)$', fulltitle)
    if match_source:
        fulltitle = match_source.group(1).strip()
        titlesource = match_source.group(2)

    # build a source node for the translation:
    titlesource_node = BNode ()
    records_bf.add ((titlesource_node, RDF.type, BF.AdminMetadata))
    records_bf.add ((titlesource_node, BFLC.metadataLicensor, Literal(titlesource)))

    # add the title string to the bnode:
    records_bf.add((translated_title, BF.mainTitle, Literal(fulltitle, lang=fulltitle_language)))
    records_bf.add((translated_title, RDFS.label, Literal(fulltitle, lang=fulltitle_language)))
    records_bf.add((translated_title, BF.adminMetadata, titlesource_node))

    return (translated_title)


## Function: Add Abstracts - original abstract (from fields ABH, ABLH, ABSH1, ABSH2) and translated/secondary abstract (from ABN, ABLN, ASN1, ASN2)

- Main Abstract: 
    - abstract text is in field ABH.
    - abstract language is in ABLH ("German" or "English") but can be missing in rare cases! In that case, we guess it using the langid module.
    - abstract original source is in ASH1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASH2 ()
- Secondary Abstract 
    - abstract text is in field ABN.
    - abstract language is in ABLN ("German" or "English")
    - abstract original source is in ASN1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASN2 ()

Scheme:

```turtle
<W> bf:summary 
    [ a pxc:Abstract , bf:Summary ;
        rdfs:label  "Background: Loneliness is ..."@en ;
        bf:adminMetadata  [ 
            a bf:AdminMetadata ;
            bflc:metadataLicensor  "Original";
            bf:descriptionModifier "ZPID"
        ]
] .
```

In [17]:
from modules.mappings import abstract_origin_original, abstract_origin_zpid, abstract_origin_deepl, abstract_origin_gesis, abstract_origin_fis_bildung, abstract_origin_krimz

def replace_abstract_origin_string(origin_string):
    # if the passed string is in "abstract_origin_original", thenreplace it with "Original":
    if origin_string in abstract_origin_original:
        return "Original"
    elif origin_string in abstract_origin_zpid:
        return "ZPID"
    # elif origin_string in abstract_origin_iwf:
    #     return "IWF"
    elif origin_string in abstract_origin_deepl:
        return "DeepL"
    elif origin_string in abstract_origin_gesis:
        return "GESIS"
    elif origin_string in abstract_origin_fis_bildung:
        return "FIS Bildung"
    elif origin_string in abstract_origin_krimz:
        return "KrimZ"
    else:
        return origin_string


# function to get the original abstract:
def get_bf_abstract(work_uri, record):
    """Extracts the abstract from field ABH and adds a bf:Summary bnode with the abstract and its metadata. Also extracts the Table of Content from the same field."""
    abstract = BNode()
    # abstract = URIRef(work_uri + "/abstract")
    records_bf.add ((abstract, RDF.type, PXC.Abstract))
    # get abstract text from ABH
    abstracttext = html.unescape(mappings.replace_encodings(record.find("ABH").text).strip())
    # check if the abstracttext ends with " (translated by DeepL)" and if so, remove that part:
    match1 = re.search(r'^(.*)\s\(translated by DeepL\)$', abstracttext)
    if match1:
        abstracttext = match1.group(1).strip()
    # check via regex if there is a " - Inhalt: " or " - Contents: " in it.
    # if so, split out what comes after. Drop the contents/inhalt part itself.
    match2 = re.search(r'^(.*)[-–]\s*(?:Contents|Inhalt)\s*:\s*(.*)$', abstracttext)
    if match2:
        abstracttext = match2.group(1).strip()
        contents = match2.group(2).strip()
        # make a node for bf:TableOfContents:
        toc = BNode()
        records_bf.add((toc, RDF.type, BF.TableOfContents))
        # add the bnode to the work via bf:tableOfContents:
        records_bf.add((work_uri, BF.tableOfContents, toc))
        # add the contents to the abstract node as a bf:tableOfContents:
        # if the contents start with http, extract as url into rdf:value:
        if contents.startswith("http"):
            records_bf.add((toc, RDF.value, Literal(contents, datatype=XSD.anyURI)))
            # otherwise it's a text toc and needs to go into the label
        else:
            records_bf.add((toc, RDFS.label, Literal(contents)))
    # get abstract language from ABLH ("German" or "English")
    abstract_language = "en" # set default
    # that's a bad idea, actually. Better: if field is missing, use a language recog function!
    if record.find("ABLH") is not None:
        abstract_language = get_langtag_from_field(record.find("ABLH").text.strip())[0]
        if abstract_language == "und":
            # guess language from the text:
            abstract_language = guess_language(abstracttext)
    else: # if the ABLH field is missing, try to recognize the language of the abstract from its text:
        abstract_language = guess_language(abstracttext)

    # add the text to the bnode:
    records_bf.add ((abstract, RDFS.label, Literal(abstracttext, lang=abstract_language)))

    # get abstract original source from ASH1 ("Original" or "ZPID")
    abstract_source = "Original" # default
    # create a blank node for admin metadata:
    abstract_source_node = BNode()
    records_bf.add((abstract_source_node, RDF.type, BF.AdminMetadata))

    if record.find("ASH1") is not None:
        # overwrite default ("Original") with what we find in ASH1:
        # and while we're at it, replace some known strings with their respective values 
        # (e.g. employee tags with "ZPID"):
        abstract_source = replace_abstract_origin_string(record.find("ASH1").text.strip())
    
    # write final source text into source node:
    records_bf.add((abstract_source_node, BFLC.metadataLicensor, Literal(abstract_source)))

    # here is a list of known zpid employee tags, we will use them later to replace these with "ZPID" if found in ASH2:

    # and this is a list of things we want to replace with "Original":
    

    # get optional agent who edited the original abstract from ASH2
    if record.find("ASH2") is not None:
        # note what we find in ABSH2:
        abstract_editor = replace_abstract_origin_string(record.find("ASH2").text.strip())
        
        records_bf.add((abstract_source_node, BF.descriptionModifier, Literal(abstract_editor)))


    #add the source node to the abstract node:
    records_bf.add((abstract, BF.adminMetadata, abstract_source_node))
    # and return the completed node:
    #return (abstract)
# or better, attach it right away:
    records_bf.add((work_uri, BF.summary, abstract))

def get_bf_secondary_abstract(work_uri, record):
    abstract = BNode()
    # abstract = URIRef(work_uri + "/abstract/secondary")
    records_bf.add ((abstract, RDF.type, PXC.Abstract))
    records_bf.add ((abstract, RDF.type, PXC.SecondaryAbstract))
    abstracttext = html.unescape(mappings.replace_encodings(record.find("ABN").text).strip())
    # check if the abstracttext ends with " (translated by DeepL)" and if so, remove that part:
    match = re.search(r'^(.*)\s\(translated by DeepL\)$', abstracttext)
    if match:
        abstracttext = match.group(1).strip()

    abstract_language = "de" # fallback default
    
    if record.find("ABLN") is not None and record.find("ABLN").text != "":
        abstract_language = get_langtag_from_field(record.find("ABLN").text.strip())[0]
        if abstract_language == "und":
            # guess language from the text:
            abstract_language = guess_language(abstracttext)
    else: # if no language field, guess language from the text:
        abstract_language = guess_language(abstracttext)
    
    records_bf.add ((abstract, RDFS.label, Literal(abstracttext, lang=abstract_language)))
    
    abstract_source_node = BNode()
    records_bf.add((abstract_source_node, RDF.type, BF.AdminMetadata))
    abstract_source = "Original" # fallback default
    if record.find("ASN1") is not None:
        # overwrite default ("Original") with what we find in ASH1:
        abstract_source = replace_abstract_origin_string(record.find("ASN1").text.strip())
    
    records_bf.add((abstract_source_node, BFLC.metadataLicensor, Literal(abstract_source)))

    # get optional agent who edited the original abstract from ASH2
    if record.find("ASN2") is not None:
        # note what we find in ABSN2:
        abstract_editor = replace_abstract_origin_string(record.find("ASN2").text.strip())
        # and add it via decription modifier:
        records_bf.add((abstract_source_node, BF.descriptionModifier, Literal(abstract_editor)))

    #add the source node to the abstract node:
    records_bf.add((abstract, BF.adminMetadata, abstract_source_node))
    # and return the completed node:
    return (abstract)




## Function to split Table of Content from the Abstract field (ABH)

This usually starts with " - Inhalt: " (for German Abstracts) or " - Contents: " (in English abstracts) and ends at the end of the field.
It can contain a numbered list of chapters or sections as a long string. It can also contain a uri from dnb namespace instead or in addition!

Examples:
- " - Contents: (1) ..."
- " - Inhalt: https://d-nb.info/1256712809/04</ABH>" (URI pattern: "https://d-nb.info/" + "1256712809" 10 digits + "/04")

Example:

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdfs:label "(1) Wünsche, J., Weidmann, R. &amp; Grob, A. (n. d.). Happy in the same way? The link between domain satisfaction and overall life satisfaction in romantic couples. Manuscript submitted for publication. (2) Wünsche, J., Weidmann,...";
] .
```

Or

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdf:value "https://d-nb.info/1002790794/04"^^xsd:anyURI ;
] .
```

In [18]:
def get_bf_toc(work_uri, record):
    # read the abstract in ABH
    contents = ""
    if record.find("ABH") is not None:
        abstracttext = html.unescape(mappings.replace_encodings(record.find("ABH").text).strip())
        # check via regex if there is a " - Inhalt: " or " - Contents: " in it.
        # if so, split out what comes after. Drop the contents/inhalt part itself.
        match = re.search(r'^(.*)[-–]\s*(?:Contents|Inhalt)\s*:\s*(.*)$', abstracttext)
        if match:
            abstracttext = match.group(1).strip()
            contents = match.group(2).strip()

    # also check if what comes is either a string or a uri following thegiven pattern
    # and export one as a rdfs_label and the other as rdf:value "..."^^xsd:anyUrl (remember to add XSD namespace!)
    # also remember that we should only create a node and attach it to the work
    # if a) ABH exists at all and
    # b) the regex is satisfied.
    # So I guess we must do the whole checking and adding procedure in this function!

    # only return an added triple if the toc exisits, otherwise return nothing:
    if contents:
        return records_bf.add((work_uri, BF.tableOfContents, Literal(contents)))
    else: 
        return None
    # return records_bf.add((work_uri, BF.tableOfContents, Literal("test")))

## Function: Create Person Contribution nodes from Fields AUP, EMID, EMAIL, AUK, PAUP, CS and COU

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:contribution 
    [
        # the Bibframe Contribution includes, as usual, an agent and their role,
        # but is supplemented with an Affiliation (in the context of that work/while it was written),
        # and a position in the author sequence.
        a bf:Contribution, bflc:PrimaryContribution; 
        bf:agent 
        [
            a bf:Person, schema:Person; 
            rdfs:label "Trillitzsch, Tina"; # name when creating work
            schema:givenName "Tina"; schema:familyName "Trillitzsch";
            owl:sameAs <https://w3id.org/zpid/person/tt_0000001>, <https://orcid.org/0000-0001-7239-4844>; # authority uris of person (local, orcid)
            bf:identifiedBy [a bf:Local, pxc:PsychAuthorsID; rdf:value "p01979TTR"; #legacy authority ID
            ];
            bf:identifiedBy [a bf:Identifier, locid:orcid; rdf:value "0000-0001-7239-4844"; # ORCID 
            ];
        ]
        # we use a model inspired by Option C in Osma Suominen'a suggestion for https://github.com/dcmi/dc-srap/issues/3
        # adding the Affiliation into the Contribution, separate from the agent itself, since the affiliation
        # is described in the context of this work, not not as a statement about the person's
        # current affiliation:
        mads:hasAffiliation [
            a mads:Affiliation;
            # Affiliation blank node has info about the affiliation org (including persistent identifiers),
            # the address (country with geonames identifier),
            # and the person's email while affiliated there.
            mads:organization [
                a bf:Organization; 
                rdfs:label "Leibniz Institute of Psychology (ZPID); Digital Research Development Services"; # org name when work was created
                owl:sameAs <https://w3id.org/zpid/org/zpid_0000001>, <https://ror.org/0165gz615>; # authority uris of org (local, ror)
                # internal id and ror id as literal identifiers:
                bf:identifiedBy [a bf:Local, pxc:ZpidCorporateBodyId; rdf:value "0000001"; ];
                bf:identifiedBy [a bf:Identifier; locid:ror; rdf:value "0165gz615"; ];
            ];
            mads:hasAffiliationAddress [a mads:Address;
                mads:country [
                    a mads:Country, bf:Place;
                    rdfs:label "Germany";
                    bf:identifiedBy [a bf:Identifier, locid:geonames; rdf:value "2921044"; ];
                    owl:sameAs <https://w3id.org/zpid/place/country/ger>;
                ]
            ];
            mads:email <mailto:ttr@leibniz-psychology.org>; # correspondence author email
        ];
        bf:role <http://id.loc.gov/vocabulary/relators/aut>;
        pxp:contributionPosition 1; bf:qualifier "first"; # first author in sequence: our own subproperty of bf:qualifier & schema:position (also: middle, last)
    ].
```

Todos:
- [x] create blank node for contribution and add agent of type bf:Person
- [x] add author position (first, middle, last plus order number) to the contribution
- [x] make first author a bflc:PrimaryContribution
- [x] match AUP with PAUP to get person names and ids (normalize first)
- [x] extend AUP-PAUP match with lookup in kerndaten table/ttl to compare schema:alternatename of person with name in AUP (but first before normalization)
- [x] add ORCID to the person's blank node (doesn't add 4 ORCIDs for unknown reason - maybe duplicates?)
- [x] add EMAIL to person's blank node (either to person in EMID or to first author)
- [x] add affiliation from CS field and COU field to first author
- [x] add Affiliation blank node with org name, country to each author that has these subfields in their AUP (|i and |c)
- [x] add role from AUP subfield |f
- [x] add country geonames id using lookup table
- [ ] move mads:email Literal from bf:Contribution to mads:Affiliation
- [ ] later: reconcile affiliations to add org id, org ror id (once we actually have institution authority files)


In [19]:
from modules.mappings import geonames_countries

def country_geonames_lookup(country):
    for case in geonames_countries:
        if case[0].casefold() == str(country).casefold():
            return case[0], case[1]
    return None

In [20]:
def sanitize_country_names(country_name):
    if country_name == "COSTA":
        country_name = "Costa Rica"
    elif country_name == "CZECH":
        country_name = "Czech Republic"
    elif country_name == "NEW":
        country_name = "New Zealand"
    elif country_name == "SAUDI":
        country_name = "Saudi Arabia"
    elif country_name == "PEOPLES":
        country_name = "People's Republic of China"
    return country_name

In [21]:




def add_bf_contributor_person_role(role):
    # return role_uri
    return URIRef(ROLES + role)



def normalize_names(familyname,givenname):
    familyname_normalized = familyname.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("Ä", "Ae").replace("Ö", "Oe").replace("Ü", "Ue").replace("ß", "ss")
    # generate an abbreviated version of givenname (only the first letter), but 
    if givenname:
        givenname_abbreviated = givenname[0] + "."
        # generate a normalized version of the name by concatenating the two with a comma as the separator:
        fullname_normalized = familyname_normalized + ", " + givenname_abbreviated
    return fullname_normalized

def match_paup(record, person_node, personname_normalized):
    # loop through all PAUPs and check if the name matches the normalized personname
    for paup in record.findall("PAUP"):
        # given a string such as "Forkmann, Thomas |n p06946TF |u https://www.psychauthors.de/psychauthors/index.php?wahl=forschung&amp;#38;uwahl=psychauthors&amp;#38;uuwahl=p06946TF"
        # split into family name, given name and paId where "Forkmann" is the family name, "Thomas" is the given name and "p06946TF" is the paId:
        paup_split = paup.text.strip().split("|n")[0].strip().split(",")
        if len(paup_split) > 1:
            paup_familyname = paup_split[0].strip()
            paup_givenname = paup_split[1].strip()
            paId = paup.text.strip().split("|n")[1].strip().split("|")[0].strip()
            
            # generate a normalized version of paup_familyname:
            paup_name_normalized = normalize_names(paup_familyname,paup_givenname)
            # generate a uri for the person from the paId that can match the one in kerndaten.ttl generated from psychauthors database:
            person_uri = URIRef("https://w3id.org/zpid/person/" + paId)
            
            # now check if the normalized name from PAUP matches the normalized person name from AUP:
            # if they match and there is a matching person in the kerndaten.ttl graph, add the person uri as schema:sameAs and the current preferred name from psychauthors as schema:preferredName, then return the paId:
            if paup_name_normalized == personname_normalized and (person_uri, RDF.type, SCHEMA.Person) in kerndaten:
                # for debugging, print the actual name in the matching PAUP:
                #records_bf.add((person_node, PXP.paupName, Literal(paup_familyname + ", " + paup_givenname)))
                records_bf.add((person_node, SCHEMA.sameAs, person_uri))
                # add the preferred name from kerndaten as schema:preferredName:
                #records_bf.add((person_node, SCHEMA.preferredName, kerndaten.value(person_uri, SCHEMA.name)))
                # return the psychauthors ID:
                return paId
            # but if PAUP and AUP names are no match, even normalized,
            # go through all the alternate names in kerndaten for that Psychauthors ID and check if they match the normalized person name from AUP (this will even find completely changed names, from maiden name to married name etc.):
            elif paup_name_normalized != personname_normalized and (person_uri, RDF.type, SCHEMA.Person) in kerndaten:
                for alternatename in kerndaten.objects(person_uri, SCHEMA.alternateName):
                    # split alternatename into first and last name:
                    alternatename_split = alternatename.split(",")
                    if len(alternatename_split) > 1:
                        alternatename_familyname = alternatename_split[0].strip()
                        alternatename_givenname = alternatename_split[1].strip()
                        # generate a normalized version of alternatename_familyname to compare with PAUP name later:
                        alternatename_normalized = normalize_names(alternatename_familyname,alternatename_givenname)
                        if personname_normalized == alternatename_normalized:
                            # we have found another match!
                            # add the uri as schema:sameAs and put the current preferred name from psychauthors here, too (for debugging purposes):
                            records_bf.add((person_node, SCHEMA.sameAs, person_uri))
                            #records_bf.add((person_node, SCHEMA.preferredName, kerndaten.value(person_uri, SCHEMA.name)))
                            return paId
            else:
                return None

def get_orcid(record, person_node, personname):
    # loop through all ORCIDs and check if the name matches the personname
    for orcid in record.findall("ORCID"):
        # go through all ORCID fields and check for matches of personname with the text before "|u": 
        # split the orcid string into the orcid id and the name:
        orcid_split = orcid.text.strip().split("|u")
        
        # if there is a name part, compare it to the personname:
        if len(orcid_split) > 1:
            orcid_name = mappings.replace_encodings(orcid_split[0]).strip()
            orcidId = orcid_split[1].strip()
            # clean up the orcid_id by removing spaces that sometimes sneak in when entering them in the database:
            orcidId = orcidId.replace(" ", "")
            
            # by the way, here is a regex pattern for valid orcids:
            orcid_pattern = re.compile(r'^\d{4}-\d{4}-\d{4}-\d{3}[0-9X]$')
            # and a way to check if the orcid id matches the pattern:
            # if not orcid_pattern.match(orcidId):
            #     print("invalid orcid: " + orcidId)
            # use try to check if orcidId is a valid orcid:
            if not orcid_pattern.match(orcidId): 
                print("invalid orcid: " + orcidId)    
            # if the name matches, return the orcid id for adding it to the person node:
            if orcid_name == personname:
                return orcidId
            else:
                #print("dangling orcid (no match): " + orcidId)
                return None
            

def get_local_authority_institute(affiliation_string, country):
    """Uses ~~fuzzywuzzy~~ RapidFuzz to look up the affilaition string in a local authority table loaded from a csv file."""
    if country == "LUXEMBOURG":
        best_match = process.extractOne(affiliation_string, lux_institutes, scorer=fuzz.token_set_ratio)
        return best_match[0].get("uuid")
    else: 
        return None

def get_ror_id_from_api(affiliation_string):
    # this function takes a string with an affiliation name and returns the ror id for that affiliation from the ror api
    # clean the string to make sure things like "^DDS" are replaced:
    #affiliation_string = my_xml_escape(affiliation_string)
    #replace_encodings(affiliation_string)
    ror_api_url = ROR_API_URL + affiliation_string
    # make a request to the ror api:
    # ror_api_request = requests.get(ror_api_url)
    # make request to api with caching:
    ror_api_request = session.get(
            ror_api_url, timeout=20
    )
    # if the request was successful, get the json response:
    if ror_api_request.status_code == 200:
        ror_api_response = ror_api_request.json()
        # check if the response has any hits:
        if len(ror_api_response["items"]) > 0:
            # if so, get the item with a key value pair of "chosen" and "true" and return its id:
            for item in ror_api_response["items"]:
                if item["chosen"] == True:
                    return item["organization"]["id"]
        else:
            return None
    else:
        return None

def build_affiliation_nodes(person_affiliation, person_affiliation_country):
    # person_affiliation = replace_encodings(person_affiliation)
    # is passed two string: the affiliation name and the affiliation country name
    # make a blank node for the affiliation and make it class mads:Affiliation:
    person_affiliation_node = BNode()
    records_bf.add((person_affiliation_node, RDF.type, MADS.Affiliation))
    # make a blank node for the affiliation organization and make it class bf:Organization:
    person_affiliation_org_node = BNode()
    records_bf.add((person_affiliation_org_node, RDF.type, BF.Organization))
    # add the affiliation organization node to the affiliation node:
    records_bf.add((person_affiliation_node, MADS.organization, person_affiliation_org_node))
    # add the affiliation string to the affiliation org node:
    records_bf.add((person_affiliation_org_node, RDFS.label, Literal(person_affiliation)))

    # do a ror lookup for the affiliation string
    # and if there is a ror id, add the ror id as an identifier:
    affiliation_ror_id = None
    affiliation_ror_id = get_ror_id_from_api(person_affiliation)
    
 
    if affiliation_ror_id is not None:
        # add a blank node fore the identifier:
        affiliation_ror_id_node = BNode()
        # make it a locid:ror:
        records_bf.add((affiliation_ror_id_node, RDF.type, LOCID.ror))
        records_bf.add((person_affiliation_org_node, BF.identifiedBy, affiliation_ror_id_node))
        # add the ror id as a literal to the identifier node:
        records_bf.add((affiliation_ror_id_node, RDF.value, Literal(affiliation_ror_id)))

    affiliation_local_id = None
    affiliation_local_id = get_local_authority_institute(person_affiliation, person_affiliation_country)
    if affiliation_local_id is not None:
        # add a blank node fore the identifier:
        affiliation_local_id_node = BNode()
        # make it a pxc:OrgID:
        records_bf.add((affiliation_local_id_node, RDF.type, PXC.OrgID))
        records_bf.add((person_affiliation_org_node, BF.identifiedBy, affiliation_local_id_node))
        # add the local uuid as a literal to the identifier node:
        records_bf.add((affiliation_local_id_node, RDF.value, Literal(affiliation_local_id)))

    # make a blank node for the affiliation address and make it class mads:Address:
    person_affiliation_address_node = BNode()
    records_bf.add((person_affiliation_address_node, RDF.type, MADS.Address))
    # add a country node to the affiliation address node:
    person_affiliation_country_node = BNode()
    records_bf.add((person_affiliation_country_node, RDF.type, MADS.Country))
    # add the country node to the affiliation address node:
    records_bf.add((person_affiliation_address_node, MADS.country, person_affiliation_country_node))
    # add the affiliation address string to the affiliation address node:
    records_bf.add((person_affiliation_country_node, RDFS.label, Literal(person_affiliation_country)))

    # if the country is in the geonames lookup table, add the geonames uri as sameAs and the geonames id as an identifier:
    if country_geonames_lookup(person_affiliation_country):
        improved_country_name, geonamesId = country_geonames_lookup(person_affiliation_country)
        # create a url to click and add it with sameas:
        # geonames_uri = URIRef("http://geonames.org/" + geonamesId + "/")
        # records_bf.add((person_affiliation_country_node, SCHEMA.sameAs, geonames_uri))
        # replace the country name in the affiliation address node with the improved country name:
        records_bf.add((person_affiliation_country_node, RDFS.label, Literal(improved_country_name)))
        # and remove the old label:
        records_bf.remove((person_affiliation_country_node, RDFS.label, Literal(person_affiliation_country)))
        # add the geonames identifier:
        person_affiliation_country_identifier_node = BNode()
        records_bf.add((person_affiliation_country_identifier_node, RDF.type, BF.Identifier))
        records_bf.add((person_affiliation_country_identifier_node, RDF.type, LOCID.geonames))
        records_bf.add((person_affiliation_country_identifier_node, RDF.value, Literal(geonamesId)))
        records_bf.add((person_affiliation_country_node, BF.identifiedBy, person_affiliation_country_identifier_node))
    # add the affiliation address node to the affiliation node:
    records_bf.add((person_affiliation_node, MADS.hasAffiliationAddress, person_affiliation_address_node))

    # return the finished affiliation node with all its children and attached strings:
    return person_affiliation_node

# the full function that creates a contribution node for each person in AUP:
# first, get all AUPs in a record and create a blank node for each of them
def add_bf_contributor_person(work_uri, record):
    # initialize a counter for the contribution position and a variable for the contribution qualifier:
    contribution_counter = 0
    contribution_qualifier = None
    
    for person in record.findall("AUP"):
        # count how often we've gone through the loop to see the author position:
        contribution_counter += 1
        # make a blank node for the bf:Contribution:
        contribution_node = BNode()
        # contribution_node = URIRef(work_uri + "/contribution/" + str(contribution_counter))
        records_bf.add((contribution_node, RDF.type, BF.Contribution))
        
        # make a blank node for the person:
        person_node = BNode()
        records_bf.add((person_node, RDF.type, BF.Person))

        # add the counter as an author position to the contribution node:
        records_bf.add((contribution_node, PXP.contributionPosition, Literal(contribution_counter)))

        # if we are in the first loop, set "contrution_qualifier" to "first":
        if contribution_counter == 1:
            contribution_qualifier = "first"
            records_bf.add((contribution_node, RDF.type, BFLC.PrimaryContribution))
        # if we are in the last loop, set "contribution_qualifier" to "last":
        elif contribution_counter == len(record.findall("AUP")):
            contribution_qualifier = "last"
        # if we are in any other loop but the first or last, set "contribution_qualifier" to "middle":
        else:
            contribution_qualifier = "middle"

        # add the contribution qualifier to the contribution node:
        records_bf.add((contribution_node, BF.qualifier, Literal(contribution_qualifier)))

        # add the name from AUP to the person node, but only use the text before the first |: (and clean up the encoding):
        personname = mappings.replace_encodings(person.text.split("|")[0]).strip()

        records_bf.add((person_node, RDFS.label, Literal(personname)))        

        # initialize variables for later use:
        personname_normalized = None
        orcidId = None

        # split personname into first and last name:
        personname_split = personname.split(",")
        if len(personname_split) > 1:
            familyname = personname_split[0].strip()
            givenname = personname_split[1].strip()
            records_bf.add((person_node, SCHEMA.familyName, Literal(familyname)))
            records_bf.add((person_node, SCHEMA.givenName, Literal(givenname)))
            # generate a normalized version of familyname to compare with PAUP name later:
            personname_normalized = normalize_names(familyname,givenname)
            # for debugging, print the normalized name:
            # records_bf.add((person_node, PXP.normalizedName, Literal(personname_normalized)))

        # call the function match_paup to match the personname from AUP with the PAUPs:
        paId = match_paup(record, person_node, personname_normalized)
        if paId is not None:
            # create a blank node for the identifier:
            # we coulkd do this into the function, but then I will have to return something else
            psychauthors_identifier_node = BNode()
            records_bf.add((psychauthors_identifier_node, RDF.type, BF.Identifier))
            records_bf.add((psychauthors_identifier_node, RDF.type, BF.Local))
            records_bf.add((psychauthors_identifier_node, RDF.type, PXC.PsychAuthorsID))
            records_bf.add((psychauthors_identifier_node, RDF.value, Literal(paId)))
            # add the identifier node to the person node:
            records_bf.add((person_node, BF.identifiedBy, psychauthors_identifier_node))
            # create a urL from the paid and add it as a "webpage describing this entity" to the person node:
            psychauthors_url = "https://www.psychauthors.de/psychauthors/index.php?wahl=forschung&uwahl=psychauthors&uuwahl=" + paId
            records_bf.add((person_node, SCHEMA.mainEntityOfPage, URIRef(psychauthors_url)))
        
        # call the function get_orcid to match the personname with the ORCIDs in the record:
        orcidId = get_orcid(record, person_node, personname)
        if orcidId is not None:
            # create a blank node for the identifier:
            orcid_identifier_node = BNode()
            records_bf.add((orcid_identifier_node, RDF.type, BF.Identifier))
            records_bf.add((orcid_identifier_node, RDF.type, LOCID.orcid))
            records_bf.add((orcid_identifier_node, RDF.value, Literal(orcidId)))
            # add the identifier node to the person node:
            records_bf.add((person_node, BF.identifiedBy, orcid_identifier_node))
            # add the orcid id as a sameAs link to the person node:
            orcid_uri = "https://orcid.org/" + orcidId
            records_bf.add((person_node, SCHEMA.sameAs, URIRef(orcid_uri)))

        ## ----- 
        # Getting Affiliations and their countries from first, CS and COU (only for first author), and then from subfields |i and |c in AUP (for newer records)
        ## -----
        
        # initialize variables we'll need for adding affiliations and country names from AUP |i and CS/COU/ADR:
        affiliation_string = None
        affiliation_country = None
      
        # match affiliations in CS and COU to first contribution/author:
        # dont add ADR here yet (even if this is the place for it - we may drop that info anyway.
        # look for the field CS:
        # if the contribution_counter is 1 (i.e. if this is the first loop/first author), add the affiliation to the person node:
        if contribution_counter == 1:
            if record.find("CS") is not None:
                # get the content of the CS field:
                affiliation_string = html.unescape(mappings.replace_encodings(record.find("CS").text.strip()))

            if record.find("COU") is not None:
                # get the country from the COU field:
                affiliation_country = mappings.replace_encodings(sanitize_country_names(record.find("COU").text.strip()))

                
        ## Get affiliation from AUP |i, country from |c:
        # no looping necessary here, just check if a string |i exists in AUP and if so, add it to the person node:
        # if AUP contains "|i ", use anything after it and before the end of the string or before another "|" as the affiliation string:
        if person.text.find("|i ") > -1:
            # save that text in a variable:
            affiliation_string = html.unescape(mappings.replace_encodings(person.text.split("|i")[1].split("|")[0]).strip())
            # affiliation_string = replace_encodings(affiliation_string)
            
        # now check if there is a country in |c:
        if person.text.strip().find("|c ") > -1:
            # save that text in a variable:
            affiliation_country = mappings.replace_encodings(sanitize_country_names(person.text.strip().split("|c")[1].strip().split("|")[0].strip()))

        # pass this to function build_affiliation_nodes to get a finished affiliation node:
        if affiliation_string != "" and affiliation_string is not None:
            affiliation_node = build_affiliation_nodes(affiliation_string, affiliation_country)
            # add the affiliation node to the contribution node:
            records_bf.add((contribution_node, MADS.hasAffiliation, affiliation_node))

        # look for the field EMAIL:
        email = None
        # TODO: the email address actually belongs into the affiliation section, but we'll leave it directly in the contribution node for now:
        if record.find("EMAIL") is not None:
            # get the email address from the EMAIL field, replacing spaces with underscores (common problem in urls in star) and adding a "mailto:" prefix:
            email = html.unescape(mappings.replace_encodings(record.find("EMAIL").text.strip().replace(" ", "_")))
            # check if this is a valid email address:
            email_pattern = re.compile(r'^([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x22([^\x0d\x22\x5c\x80-\xff]|\x5c[\x00-\x7f])*\x22)(\x2e([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x22([^\x0d\x22\x5c\x80-\xff]|\x5c[\x00-\x7f])*\x22))*\x40([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x5b([^\x0d\x5b-\x5d\x80-\xff]|\x5c[\x00-\x7f])*\x5d)(\x2e([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x5b([^\x0d\x5b-\x5d\x80-\xff]|\x5c[\x00-\x7f])*\x5d))*$')
            # check if email matches the regex in email_pattern:
            if not email_pattern.match(email):
                print("invalid email address: " + email)
                # email = None
            email = "mailto:" + email
            # email = "mailto:" + record.find("EMAIL").text.strip()
            # if there is no EMID and the contribution_counter is 1 (i.e. if this is the first loop), add the email to the person node:
            if record.find("EMID") is None and contribution_counter == 1:
                records_bf.add((contribution_node, MADS.email, URIRef(email)))
            # else match the existing EMID field to the personname:
            elif record.find("EMID") is not None and mappings.replace_encodings(record.find("EMID").text.strip()) == personname:
                records_bf.add((contribution_node, MADS.email, URIRef(email))) 
                
            
        role = None
        # check if there is a role in the AUP field:
        if person.text.strip().find("|f ") > -1:
            # save that text in a variable:
            role = person.text.strip().split("|f")[1].strip().split("|")[0].strip()
            # add it to the contribution node:
            records_bf.add((contribution_node, BF.role, add_bf_contributor_person_role(role)))
        # if there isn't, the role is "AU" by default:
        else:
            records_bf.add((contribution_node, BF.role, add_bf_contributor_person_role("AU")))

        ## --- Add the contribution node to the work node:
        records_bf.add((work_uri, BF.contribution, contribution_node))
        # add the person node to the contribution node as a contributor:
        records_bf.add((contribution_node, BF.agent, person_node))    

## TODO: Function: Create Topics, Weighted Topics and Classifications from CT, SH

Maybe try lookup with Skosmos?

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:subject [a bf:Topic, pxc:WeightedTopic, skos:Concept; # # topic, weighted
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/35365>;
        rdfs:label "Ontologies"@en, "Ontologien"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    bf:subject [a bf:Topic, skos:Concept; # a non-weighted topic
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/60135>;
        rdfs:label "Semantic Networks"@en, "Semantische Netzwerke"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    # PSYNDEX subject heading classification
    bf:classification [ a bf:Classification, pxc:SubjectHeading, skos:Concept;
        rdfs:label "Professional Psychological & Health Personnel Issues"@en;
        bf:code "3400";
        owl:sameAs <https://w3id.org/zpid/vocabs/class/3400>;
        bf:source <https://w3id.org/zpid/vocabs/class>;
    ].
```

## TODO: Function: Create nodes for Population Age Group (AGE) and Population Location (PLOC)

Use this scheme:

```turtle
<Work> 
# age group study is about/sample was from:
    bflc:demographicGroup [a bflc:DemographicGroup, pxc:AgeGroup, skos:Concept;
        rdfs:label "Adulthood"@en, "Erwachsenenalter"@de;
        owl:sameAs <https://w3id.org/zpid/vocabs/age/adulthood>;
        bf:source <https://w3id.org/zpid/vocabs/age/AgeGroups>; 
    ];
    # population location: 
    bf:geographicCoverage [a bf:GeographicCoverage, pxc:PopulationLocation, skos:Concept;
        rdfs:label "Germany"@en;
        owl:sameAs <countries/ger>;
    ].
```

## Function: Create nodes for PRREG (linked Preregistration Works)

Field PRREG can occur multiple times per record (0..n). 
It contains a link and/or DOI to a preregistration document. 

Possible subfields:
- |u URL linking to the document
- |d DOI for the document
- |i additional Info text

There are many errors we could catch here. 
- [x] Most importantly, we can replace any " " with "_" in the |u.
- [x] Also, |d should contain pure DOIs with prefixes, so they should start with "10." If they don't, remove any prefixes to make a "pure" DOI.
- [x] remove or ignore any empty subfields that may exist (|u, |d, |i)

Example:

```turtle
<https://w3id.org/zpid/pub/work/0003> a bf:Work; 
    bflc:relationship 
    [
        a bflc:Relationship;
        bflc:relation relations:hasPreregistration;
        bf:note [a bf:Note; rdfs:label "Australian Sample"];
        bf:supplement # may change, not sure?
        [
            a bf:Work, bf:Text; 
            bf:genreForm genres:preregistration; 
            bf:content content:text;
            bf:hasInstance 
            [
                a bf:Instance;
                bf:electronicLocator <https://osf.io/prereg1>;
                bf:identifier [a bf:Identifier, bf:Doi; rdf:value "10.123code003"];
                # add bf:media "computer" from rda media types
                bf:media <http://rdvocab.info/termList/RDAMediaType/1003>;
                # bf:carrier "online resource" from rda vocabulary
                bf:carrier <http://rdvocab.info/termList/RDACarrierType/1018>;
            ]
        ] 
    ]
.
```

In [22]:
# function to build the nodes for preregistration links
from distutils.command import build


def get_bf_preregistrations(work_uri, record):
    # get the preregistration link from the field PREREG:
    preregistration_note = None
    unknown_field_content = None
    for prreg in record.findall("PRREG"):
        # get the full content of the field, sanitize it:
        prregfield = html.unescape(mappings.replace_encodings(prreg.text.strip()))
        # use our node-building function to build the node:
        relationship_node, instance = build_work_relationship_node(work_uri,relation_type="preregistration")
        doi_set = set()
        for subfield_name in ("u", "d"):
            try: 
                subfield = get_subfield(prregfield, subfield_name)
            except:
                subfield = None
            else:
                # print(subfield)
                # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
                # as a doi or url:
                # if it is a doi, run a function to generate a doi identifier node
                if check_for_url_or_doi(subfield)[1] == "doi":
                    # add the doi to a list:
                    doi_set.add(check_for_url_or_doi(subfield)[0])
                    #build_doi_identifier_node(instance, check_for_url_or_doi(subfield)[0])
                elif check_for_url_or_doi(subfield)[1] == "url":
                    build_electronic_locator_node(instance, check_for_url_or_doi(subfield)[0])
                    # if the returned typ is something else - "unknown", do nothing with it:
                else:
                    # print("bf:note > bf:Note > rdfs:label: " + subfield)
                    # build_note_node(instance, check_for_url_or_doi(subfield)[0])
                    if check_for_url_or_doi(subfield)[0] is not None and check_for_url_or_doi(subfield)[0] != "":
                        # add a variable 
                        unknown_field_content = check_for_url_or_doi(subfield)[0].strip()
                        print(f"unknown type: {unknown_field_content}. Adding as a note.")
                        # add the string as a note to the instance:
                        # build_note_node(instance, check_for_url_or_doi(subfield)[0])
        # now build the doi identifier nodes for any DOIs in the set we collected:
        for doi in doi_set:
            build_doi_identifier_node(instance, doi)
        # for the text in the |i subfield, build a note without further processing:
        try:
            preregistration_note = get_subfield(prregfield, "i")
        except:
            preregistration_note = None
        else:
            # add anything in the |i subfield as a note to the instance:
            # but if we found something unrecognizable in |u or |i, also add it to the note:
            if unknown_field_content is not None:
                build_note_node(instance, preregistration_note+ ". " + unknown_field_content )
            else:
                build_note_node(instance, preregistration_note)
        # now attach the finished node for the relationship to the work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))
      


        # add preregistration_node to work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))

## Function: Create nodes for Grants (GRANT)

Includes several helper functions that 
- extract grant numbers if several were listed in the |n subfield
- replace funder names that fundref usually doesn't match correctly or at all
- look up funder names in crossref's fundref api to get their fundref id (a doi)

In [23]:
def extract_grant_numbers(subfield_n_string):
    # this function takes a string and returns a list of award numbers
    # first, split the string on "," or ";" or "and": (first replacing all semicolons and "ands" with commas)")
    subfield_n_string = subfield_n_string.replace(" and ", ", ")
    subfield_n_string = subfield_n_string.replace(" und ", ", ")
    subfield_n_string = subfield_n_string.replace(" & ", ", ")
    subfield_n_string = subfield_n_string.replace(";", ",")
    subfield_n_string = subfield_n_string.split(", ")
    # in each of the returned list elements, remove any substrings that are shorter 
    # than 5 characters (to get rid of things like " for" or "KDL: " YG: " etc.)
    # for element in subfield_n_string:
    #     if len(element) < 5:
    #         subfield_n_string.remove(element)
    # go through all the list elements and replace each with a dict,
    # which has a key "grant_number" and a key "grant_name" (which is None for now):
    # for i, element in enumerate(subfield_n_string):
    #     subfield_n_string[i] = {"grant_number": element, "grant_name": None}
    # # return the list of dicts:
    return subfield_n_string

def replace_common_fundernames(funder_name):
    """This will accept a funder name that crossref api may not recognize, at least not as the first hit,
    and replace it with a string that will supply the right funder as the first hit"""
    # if the funder_name is in the list of funder names to replace (in index 0), then replace it with what is in index 1:
    for funder in mappings.funder_names_replacelist:
        if funder_name == funder[0]:
            funder_name = funder[1]
            # print("replacing " + funder[0] + " with " + funder[1])
    return funder_name

def get_crossref_funder_id(funder_name):
    # this function takes a funder name and returns the crossref funder id for that funder name
    # to do this, use the crossref api.
    funder_name = replace_common_fundernames(funder_name)
    # construct the api url:
    crossref_api_url = CROSSREF_API_URL + funder_name + CROSSREF_FRIENDLY_MAIL
    # make a request to the crossref api:
    # crossref_api_request = requests.get(crossref_api_url)
    # make request to api:
    crossref_api_request = session_fundref.get(
            crossref_api_url, timeout=20
    )
    crossref_api_response = crossref_api_request.json()
    # result_count = int(crossref_api_response["message"]["total-results"])
    # if the request was successful, get the json response:
    
    if crossref_api_request.status_code == 200 and \
    crossref_api_response["message"]["total-results"] >0: 
        # return the number of results:
        #print("Treffer: " + str(crossref_api_response["message"]["total-results"]))
        # return the first hit:
        # print("Erster Treffer: " + crossref_api_response["message"]["items"][0]["name"])
        # print("DOI: " + "10.13039" + crossref_api_response["message"]["items"][0]["id"])
       return "10.13039/" + crossref_api_response["message"]["items"][0]["id"]
    else:
        # retry the funder_name, but remove any words after the first comma:
        if funder_name.find(",") > -1:
            funder_name = funder_name.split(",")[0]
            return get_crossref_funder_id(funder_name)
        else:
            return None
    

# function to build the nodes for preregistration links
def get_bf_grants(work_uri, record):
    """this function takes a string and returns a funder (name and fundref doi), a list of grant number, a note with grant holder and info"""
    for grant in record.findall("GRANT"):
        # point zero: remove html entities from the field:
        grantfield = html.unescape(grant.text)
        # if the field contains these, skip it - don't even create a fundinfregerence node:
        if "projekt deal" in grantfield.lower() or "open access" in grantfield.lower():
            continue
    # point one: pipe all text in the field through the DD-Code replacer function:
        grantfield = mappings.replace_encodings(grantfield)
        # add a blank node for a new Contribution:
        funding_contribution_node = BNode()
        # records_bf.add((funding_contribution_node, RDF.type, BF.Contribution))
        records_bf.add((funding_contribution_node, RDF.type, PXC.FundingReference))
        # add a blank node for the funder agent:
        funder_node = BNode()
        records_bf.add((funder_node, RDF.type, BF.Agent))
        # add the funder agent node to the funding contribution node:
        records_bf.add((funding_contribution_node, BF.agent, funder_node))
        # add a role to the funding contribution node:
        records_bf.add((funding_contribution_node, BF.role, URIRef("http://id.loc.gov/vocabulary/relators/spn")))
        
    # first, use anything before the first "|" as the funder:
        # but because the database is still messy, use a default funder name in case there
        # is no name in the field:
        funder_name = "FUNDERNAME NOT FOUND"
        # funder = {"funder_name": grantfield.split("|")[0].strip(), "funder_id": None}
        funder_name = grantfield.split("|")[0].strip()
        # add the funder name to the funder node:
        records_bf.add((funder_node, RDFS.label, Literal(funder_name)))
        # try to look up this funder name in the crossref funder registry:
        # if there is a match, add the crossref funder id as an identifier:
        crossref_funder_id = None
        crossref_funder_id = get_crossref_funder_id(funder_name)
        if crossref_funder_id is not None:
            # add a blank node for the identifier:
            crossref_funder_id_node = BNode()
            # use our custim identifier class FundRefDoi (subclass of bf:Doi):
            records_bf.add((crossref_funder_id_node, RDF.type, PXC.FundRefDoi))
            records_bf.add((funder_node, BF.identifiedBy, crossref_funder_id_node))
            # add the crossref funder id as a literal to the identifier node:
            records_bf.add((crossref_funder_id_node, RDF.value, Literal(crossref_funder_id)))

        # then check the rest for a grant number:
        try:
        # if "|n " in grantfield:
            grants = grantfield.split("|n ")[1].split(" |")[0]
        except:
            grants = None
        else:
            grants = extract_grant_numbers(grants)
            # add the grant number to the funding contribution node:
            for grant_id in grants:
                # add a blank node for the grant (class pxc:Grant via pxp:grant)
                grant_node = BNode()
                records_bf.add((grant_node, RDF.type, PXC.Grant))
                # add the grant node to the funding contribution node:
                records_bf.add((funding_contribution_node, PXP.grant, grant_node))

                # add a blank node for the identifier:
                grant_identifier_node = BNode()
                # records_bf.add((grant_identifier_node, RDF.type, BF.Identifier))
                records_bf.add((grant_identifier_node, RDF.type, PXC.GrantId))
                records_bf.add((grant_identifier_node, RDF.value, Literal(grant_id.strip())))
                # add the identifier node to the grant node:
                records_bf.add((grant_node, BF.identifiedBy, grant_identifier_node))
        # then check the rest for a grant name or other info/note:
        try:
        # if "|i " in grantfield:
            funding_info = grantfield.split("|i ")[1].split(" |")[0]
        except:
            funding_info = None

        try:
        # if "|e " in grantfield:
            funding_recipient = grantfield.split("|e ")[1].split(" |")[0]
        except:
            funding_recipient = None
        else:
            # add an explanatory prefix text:
            funding_recipient = "Recipient(s): " + funding_recipient
            # add the funding_recipient to the funding_info (with a ". " separator), if that already contains some text, otherwise just use the funding_recipient as the funding_info:
            if funding_info is not None:
                funding_info = funding_info + ". " + funding_recipient
            else:
                funding_info = funding_recipient
        if funding_info is not None:
        # add the funding_info (with data from |i and |e to the funding contribution node as a bf:note:
            funding_info_node = BNode()
            records_bf.add((funding_info_node, RDF.type, BF.Note))
            records_bf.set((funding_info_node, RDFS.label, Literal(funding_info)))
            records_bf.add((funding_contribution_node, BF.note, funding_info_node))
        # add the funding contribution node to the work node:
        records_bf.add((work_uri, BF.contribution, funding_contribution_node))
        # return funding_contribution_node

# Function: Add Conference info from field CF



In [24]:
def get_bf_conferences(work_uri, record):
    # only use conferences from actual books (BE=SS or SM)
    # ignore those in other publication types like journal article
    if record.find("BE").text == "SS" or record.find("BE").text == "SM":
        for conference in record.findall("CF"):
            # get the text content of the CF field,
            # sanitize it by unescaping html entities and 
            # replacing STAR's ^DD encodings:
            conference_field = html.unescape(mappings.replace_encodings(conference.text.strip()))
            # try to get the conference name from the CF field:
            try: 
                # get conference_name from main CF field, using the first part before any |:
                conference_name = conference_field.split("|")[0].strip()
            except:
                conference_name = "MISSING CONFERENCE NAME"
            # then check the field for a date in apotential subfield |d:
            try:
                conference_date = conference_field.split("|d ")[1].split(" |")[0]
            except:
                conference_date = None
            else:
                # if there is a |d, add the full date to conference_note:
                conference_note = "Date(s): " + conference_date
                # extract the year from the date to use it as conference_year:
                # Anything with 4 consecutive digits anywhere in the date string is a year.
                # here is a regex for finding YYYY pattern in any string:
                year_pattern = re.compile(r'\d{4}')
                # if there is a year in the date string, use that as the date:
                if year_pattern.search(conference_date):
                    conference_year = year_pattern.search(conference_date).group()
                else:
                    conference_year = None
            # then check the field for a place in a potential subfield |o:
            try:
                conference_place = conference_field.split("|o ")[1].split(" |")[0]
            except:
                conference_place = None
            # then check for a note in a potential subfield |b, but 
            # remebering to keep what is already in conference_note:
            try:
                conference_note = conference_note + ". " + conference_field.split("|b ")[1].split(" |")[0]
            except:
                conference_note = conference_note
        
            # construct the node for the conference:
            # a bnode for the contribution/conferencereference:
            conference_reference_node = BNode()
            records_bf.add((conference_reference_node, RDF.type, PXC.ConferenceReference))
            # a blank node for the conference/meeting/agent:
            conference_node = BNode()
            records_bf.add((conference_node, RDF.type, BF.Meeting))
            # attach the agent to the contribution/conferencereference:
            records_bf.add((conference_reference_node, BF.agent, conference_node))
            # add the conference name as a label to the agent/meeting node:
            records_bf.add((conference_node, RDFS.label, Literal(conference_name)))
            # add the year as a bflc:simpleDate to the agent/meeting node:
            records_bf.add((conference_node, BFLC.simpleDate, Literal(conference_year)))
            # add the place as a bflc:simplePlace to the agent/meeting node:
            records_bf.add((conference_node, BFLC.simplePlace, Literal(conference_place)))
            # add the note as a bf:note to the agent/meeting node, first adding a bnode for the bf:Note:
            conference_note_node = BNode()
            # make it a bf:Note:
            records_bf.add((conference_note_node, RDF.type, BF.Note))
            # add the note to the note node as a literal via rdfs:label:
            records_bf.add((conference_note_node, RDFS.label, Literal(conference_note)))
            # add a bf:role <http://id.loc.gov/vocabulary/relators/ctb> to the ConferenceReference ("contributor" - which is the default for conferences in DNB and LoC):
            records_bf.add((conference_reference_node, BF.role, URIRef("http://id.loc.gov/vocabulary/relators/ctb")))
            # add the note node to the agent/meeting node via bf:note:
            records_bf.add((conference_reference_node, BF.note, conference_note_node))
            # add the conference node to the work node:
            records_bf.add((work_uri, BF.contribution, conference_reference_node))

## Functions: Add Research Data Link from DATAC and URLAI

Field URLAI should only hold a doi for a psychdata dataset (these are usually, or rather, always, restricted access). This field always has a full doi link, in various prefix formats. We remove the prefix and only keep the pure doi. 

Field DATAC has either a subfield |u with a regular url link or a subfield |d with a doi (or both).
The doi in DATAC is usually a pure doi, without any prefixes. But sometimes it's not! 

Since the data is so dirty, we make our own classficiation: we run all subfields, no matter declared Doi or URL (so from |u or |d) through our own recognition tree:
- anything that is a pure doi (starts with "10.") will be saved as a DOI (bf:identifiedBy > bf:Doi)
- so will anything with a pseudo url that is in reality just a DOI with a prefix (like "https://doi.org/10.1234/5678")
- anything that seems a regular URL (with a DOI inside) will be declared a URL (bf:electronicLocator)
- anything that is neither of the above will be ignored (or copied into a note)

So we just check all subfields, see if they contain a DOI in any form, and keep that. And then check for other urls and keep those as electroniclocators (but only if they don't contain the DOI again?)

In Bibframe, Research Data is modeled as a bnode bf:Work with a bf:Instance that has a bf:electronicLocator for a URL and a bf:identifiedBy for the Doi. This Work is in a bflc:Relationship to the study'S work, and the general relatedTo-subproperty should be bf:supplement. We also define a skos bflc:Relation "hasResearchData" to use as the bflc:Relationship's bflc:relation.

Research Data can be (or rather, contain) either Code or a DataSet, or both. We can use the bf:genreForm to distinguish between the two, and also the Work subclass (bf:Dataset, bf:Multimedia).

We also want to add the information whether the data is restricted or open access. We can do this in Bibframe with [bf_usageAndAccessPolicy](http://id.loc.gov/ontologies/bibframe/usageAndAccessPolicy) and [bf:AccessPolicy](http://id.loc.gov/ontologies/bibframe/AccessPolicy) on the Data's Instance (it is what the LoC instance-bf2marc excel table does. This info is based on MARC21 field 506).

According to the github repo of the conversion script, it should look like this:

```turtle
<Instance> bf:usageAndAccessPolicy [
    a bf:AccessPolicy;
    rdfs:label "open access"@en, "offener Zugang"@de;
    # or:
    # rdfs:label "restricted access"@en, "eingeschränkter Zugang"@de;
    rdf:value "http://purl.org/coar/access_right/c_abf2"^^xsd:anyURI; # a link to the license or uri of the skos term: here: open access
    # or:
    # rdf:value "http://purl.org/coar/access_right/c_16ec"^^xsd:anyURI; # restricted 
].
```

To be able to use a controlled vocabulary for this, we will make use of the COAR "access rights" skos vocabulary!
https://vocabularies.coar-repositories.org/access_rights/ - its four concepts: open access, restriced access, embargoed access, metadata only access. 

In [25]:
def get_urlai(work_uri, record):
    """Gets research data from field URLAI. This is always in PsychData, so it will be restricted access by default.
    We will also assume it to always be just research data, not code.
    """
    for data in record.findall("URLAI"):
        urlai_field = mappings.replace_encodings(data.text.strip())
        doi_set = set()
        #build the relationship node:
        relationship_node, instance = build_work_relationship_node(work_uri, relation_type="rd_restricted_access") 
        # there are no subfields in urlai, so let's just grab the whole thing and pass it on to the url or doi checker:
        # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
        # as a doi or url:
        # if it is a doi, run a function to generate a doi identifier node
        if check_for_url_or_doi(urlai_field)[1] == "doi":
            # build_doi_identifier_node(instance,check_for_url_or_doi(urlai_field)[0])
            doi_set.add(check_for_url_or_doi(urlai_field)[0])
        elif check_for_url_or_doi(urlai_field)[1] == "url":
            build_electronic_locator_node(instance, check_for_url_or_doi(urlai_field)[0])
        # if the returned typ is something else "unknown", do nothing with it:
        else:
            # print("bf:note > bf:Note > rdfs:label: " + urlai_field)
            build_note_node(instance, check_for_url_or_doi(urlai_field)[0])

        # loop through the set to build doi nodes, so we won't have duplicates:
        for doi in doi_set:
            build_doi_identifier_node(instance, doi)
        # now attach the finished node for the relationship to the work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))

In [26]:
def get_datac(work_uri, record):
    """Gets research data from field DATAC, adds a Relationship node to the work.
Note: We define all data from this field as type "research data only, no code", and "open/unrestricted access"
Newer data from PSYNDEXER may be something else, but for first migration, we assume all data is research data only.
"""
    # go through the list of datac fields and get the doi, if there is one:
    for data in record.findall("DATAC"):
        datac_field = mappings.replace_encodings(data.text.strip())
        # print(datac_field)
        # add an item "hello" to the set:
        #build the relationship node:
        relationship_node, instance = build_work_relationship_node(work_uri, relation_type="rd_open_access") 
        # we want to drop any duplicate dois that can occur if datac has a doi and doi url (same doi, but protocol etc prefixed) 
        # for the same data that,
        # after conversion, ends up being identical. So we make a set of dois,
        # which we will add dois to, and then later loop through the set (sets are by defintion list with only unique items!):
        doi_set = set()
        # grab subfields u and d as strings and check if they are a url or a doi:
        for subfield_name in ("u", "d"):
            try: 
                subfield = get_subfield(datac_field, subfield_name)
            except:
                subfield = None
            else:
                # print(subfield)
                # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
                # as a doi or url:
                # if it is a doi, run a function to generate a doi identifier node
                if check_for_url_or_doi(subfield)[1] == "doi":
                    # add the doi to a list:
                    doi_set.add(check_for_url_or_doi(subfield)[0])
                    #build_doi_identifier_node(instance, check_for_url_or_doi(subfield)[0])
                elif check_for_url_or_doi(subfield)[1] == "url":
                    build_electronic_locator_node(instance, check_for_url_or_doi(subfield)[0])
                    # if the returned typ is something else "unknown", do nothing with it:
                else:
                    # print("bf:note > bf:Note > rdfs:label: " + subfield)
                    build_note_node(instance, check_for_url_or_doi(subfield)[0])
        for doi in doi_set:
            build_doi_identifier_node(instance, doi)
        # now attach the finished node for the relationship to the work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))

# The Loop!
## Creating the Work and Instance uris and adding other triples via functions

### Uris and types for Bibframe profile

We want two URIs, since we split the Records into (at first) one work and one instance, which will be linked together.
We also say one will be a (rdf:type) bf:Work and the other bf:Instance.
Then we print all these triples into a file for the bibframe profile.

In [27]:
# print(len(root.findall("Record")))


record_count = 0
for record in root.findall("Record"):
    # get the count of this record:
    record_count += 1
    # create a named graph dataset for each "record":
    # Create an empty Dataset
    # d = Dataset()
    # Add a namespace prefix to it, just like for Graph
    # d.bind("ex", Namespace("http://example.com/"))
    # Declare a Graph URI to be used to identify a Graph
    # graph_1 = URIRef("http://example.com/graph/" + dfk + "/")
    # Add an empty Graph, identified by graph_1, to the Dataset
    #d.graph(identifier=graph_1)
    



    # get the DFK identifier from the record:
    dfk = record.find("DFK").text

    # create a URI for the work and the instance and give them their correct bf classes:
    work_uri = WORKS[dfk]
    records_bf.add((work_uri, RDF.type, BF.Work))
    #d.add((work_uri, RDF.type, BF.Work, graph_1))

    # create a URI for the instance:
    # instance_uri = INSTANCES[dfk]

    # for first, nested migration,
    # create a blank node for the instance:
    instance_uri = BNode()
    records_bf.add((instance_uri, RDF.type, BF.Instance))
    #d.add((instance_uri, RDF.type, BF.Instance, graph_1))

    # connect work and instance via bf:instanceOf and bf:hasInstance:
    records_bf.add((instance_uri, BF.instanceOf, work_uri))
    records_bf.add((work_uri, BF.hasInstance, instance_uri))
    #d.add((instance_uri, BF.instanceOf, work_uri, graph_1))
    #d.add((work_uri, BF.hasInstance, instance_uri, graph_1))

    # add an identifier bnode to the work using a function:
    records_bf.add((instance_uri, BF.identifiedBy, get_bf_identifier_dfk(instance_uri, dfk)))
    #d.add((instance_uri, BF.identifiedBy, get_bf_identifier_dfk(instance_uri, dfk), graph_1))

    # get field TI and add as title node:
    records_bf.add((instance_uri, BF.title, get_bf_title(instance_uri, record)))
    #d.add((instance_uri, BF.title, get_bf_title(instance_uri, record), graph_1))

    # get work language from LA
    records_bf.add((work_uri, BF.language, get_work_language(record)))
    #d.add((work_uri, BF.language, get_work_language(record), graph_1))

    # get TIUE field and add as translated title node:
    # but only if the field exists!
    if record.find("TIUE") is not None and record.find("TIUE").text != "":
        records_bf.add((instance_uri, BF.title, get_bf_translated_title(instance_uri, record)))
        #d.add((instance_uri, BF.title, get_bf_translated_title(instance_uri, record), graph_1))

    # get and add contributors:
    # records_bf.add((work_uri, BF.contribution, add_bf_contributor_person(record)))
    add_bf_contributor_person(work_uri, record)
    # get toc, if it exists:
    # get_bf_toc(work_uri, record)
    
    # get and add main/original abstract:
    # note: somehow not all records have one!
    if record.find("ABH") is not None:
        get_bf_abstract(work_uri, record)
        # records_bf.add((work_uri, BF.summary, get_bf_abstract(work_uri, record)))
        #d.add((work_uri, BF.summary, get_bf_abstract(work_uri, record), graph_1))

    # get and add main/original abstract:
    # note: somehow not all records have one!
    if record.find("ABN") is not None:
        records_bf.add((work_uri, BF.summary, get_bf_secondary_abstract(work_uri, record)))
       # d.add((work_uri, BF.summary, get_bf_secondary_abstract(work_uri, record), graph_1))

    # get and add preregistration links:
    get_bf_preregistrations(work_uri, record)

    # get and add grants by using the returned set of nodes and adding it to the work:
    # open_science.get_bf_grants_module(work_uri, record)
    get_bf_grants(work_uri, record)
    
    #get and add conferences:
    get_bf_conferences(work_uri, record) # adds the generated bfls:Relationship node to the work

    get_datac(work_uri, record) # adds the generated bfls:Relationship node to the work
       
    
    # get book mediacarrier and add as secondary instance class:
    #split_books(instance_uri, record)

    # Serialize the Dataset to a file.
    #d.serialize(destination="ttl-data/" + dfk + ".jsonld", format="json-ld", auto_compact=True)


# add a Literal for the count of records:
# records_bf.add((URIRef("https://w3id.org/zpid/bibframe/records/"), BF.count, Literal(record_count)))
# and add it to the graph:
# first, add a bnode of class bf:AdminMetadata to the graph:
records_bf_admin_metadata_root = BNode()
records_bf.add((records_bf_admin_metadata_root, RDF.type, BF.AdminMetadata))
# add this bnode to the graph:
records_bf.add((URIRef("https://w3id.org/zpid/bibframe/records/"), BF.adminMetadata, records_bf_admin_metadata_root))
# # add a bf:generationProcess to the admin metadata node:
# records_bf.add((records_bf_admin_metadata_root, BF.generationProcess, Literal("Converted from PsychAuthors XML to BIBFRAME 2.2 using Python scripts")))
# # add a bf:generationDate to the admin metadata node:
# #records_bf.add((records_bf_admin_metadata_root, BF.generationDate, Literal(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))))
# # add the count as BF.count:
records_bf.add((records_bf_admin_metadata_root, PXP.recordCount, Literal(record_count)))


print(record_count, "records")

# print all the resulting triples:
records_bf.serialize("ttl-data/bibframe_records.ttl", format="turtle")
#records_bf.serialize("ttl-data/bibframe_records.jsonld", format="json-ld", auto_compact=True, sort_keys=True, index=True)
# serialize as xml
records_bf.serialize("ttl-data/bibframe_records.xml", format="pretty-xml")

#testwork = "https://w3id.org/resources/works/0401567"
#records_bf[testwork].serialize("ttl-data/bibframe_sample.jsonld", format="json-ld")

print(len(records_bf), "triples")


invalid email address: roman_stengelin@eva.mpg.de;_schleihauf@berkeley.edu
invalid email address: roman_stengelin@eva.mpg.de;_schleihauf@berkeley.edu
invalid email address: roman_stengelin@eva.mpg.de;_schleihauf@berkeley.edu
invalid email address: roman_stengelin@eva.mpg.de;_schleihauf@berkeley.edu
700 records
103241 triples


### Uris and types for simplified profile (schema-org)

For the simplified profile, we only need one entity per record (for now) and we give it the class schema:CreativeWork.
Then we print the resulting triples into a separate file for the simplified profile that mostly uses schema.org properties and classes.

In [28]:


# print(len(root.findall("Record")))

# for record in root.findall("Record"):
#     # get the DFK identifier from the record:
#     dfk = record.find("DFK").text

#     # create a URI for the work by attaching the dfk to the works namespace and 
#     # then give it the correct schema.org class:
#     work_uri = WORKS[dfk]
#     records_schema.add((work_uri, RDF.type, SCHEMA.CreativeWork))

#     # get work language from LA
#     records_schema.add((work_uri, SCHEMA.inLanguage, get_work_language(record)))


# records_schema.serialize("ttl-data/schema_records.jsonld", format="json-ld")
# # records_schema.serialize("ttl-data/schema_records.ttl", format="turtle")
# print(len(records_schema), "triples")