# Star2BF - Star to Bibframe

Import libraries:

In [195]:
from rdflib import Graph, Literal
from rdflib.namespace import RDF, RDFS, Namespace
# from rdflib.namespace import SCHEMA, XSD
from rdflib import BNode
from rdflib import URIRef
import xml.etree.ElementTree as ET
import re

Create an "element tree" from the records in my xml file so we can loop through them and do things with them:

In [196]:
# root = ET.parse("xml-data/records-440.xml")
# root = ET.parse("xml-data/records-322.xml")
# root = ET.parse("xml-data/records-395.xml")
# root = ET.parse("xml-data/records-214.xml")
root = ET.parse("/home/tina/Developement/psyndex-workflows/star-to-rdf/data/230424_000956/xml/records-556.xml")

# To see the source xml's structure, uncomment this function:
# def print_element(element, depth=0):
#     print("\t"*depth, element.tag, element.attrib, element.text)
#     for child in element:
#         print_element(child, depth+1)

# for child in root.getroot()[:2]:
#     print_element(child)


We first set a few namespace objects for bibframe, schema.org and for our resources (the works and instances) 
themselves.

Then, we create two graphs from the xml source file, one to generate triples for our bibframe profile output, and the other for the simplified schema.org profile. 

Finally, we bind the prefixes with their appropriate namespaces to the graphs.

In [197]:
BF = Namespace("http://id.loc.gov/ontologies/bibframe/")
BFLC = Namespace("http://id.loc.gov/ontologies/bflc/")
MADS = Namespace("http://www.loc.gov/mads/rdf/v1#")
SCHEMA = Namespace("https://schema.org/")
WORKS = Namespace("https://w3id.org/zpid/resources/works/")
INSTANCES = Namespace("https://w3id.org/zpid/resources/instances/")
PXC = Namespace("https://w3id.org/zpid/ontology/classes/")
PXP = Namespace("https://w3id.org/zpid/ontology/properties/")
LANG = Namespace ("http://id.loc.gov/vocabulary/iso639-2/")
LOCID = Namespace("http://id.loc.gov/vocabulary/identifiers/")
ROLES = Namespace("https://w3id.org/zpid/vocabs/roles/")

# graph for bibframe profile:
records_bf = Graph()

kerndaten = Graph()
kerndaten.parse("ttl-data/kerndaten.ttl", format="turtle")    
# we need a new graph for the schema.org profile, so it won't just reuse the old triples from the other profile
# records_schema = Graph()

# Bind the namespaces to the prefixes we want to see in the output:
records_bf.bind("bf", BF) 
records_bf.bind("bflc", BFLC) 
records_bf.bind("works", WORKS)  
# records_schema.bind("works", WORKS) 
records_bf.bind("instances", INSTANCES) 
records_bf.bind("pxc", PXC) 
records_bf.bind("pxp", PXP) 
records_bf.bind("lang", LANG) 
records_bf.bind("schema", SCHEMA) 
records_bf.bind("locid", LOCID) 
records_bf.bind("mads", MADS) 
records_bf.bind("roles", ROLES) 

# records_schema.bind("instances", INSTANCES) 
# todo: find out why the output uses "schema1" instead of "schema" for the schema.org namespace:
# records_schema.bind("schema", SCHEMA, override=True) 


# Functions to do all the things

We need functions for the different things we will do - to avoid one long monolith of a loop.

This is where they will go. Examples: Create blank nodes for Idebtifiers, create nested contribution objects from disparate person entries in AUP, AUK, CS and COU fields, merge PAUP (psychauthor person names and ids) with the person's name in AUP...

These functions will later be called at the bottom of this notebook, in a loop over all the xml records.

## TODO: Function: Replace weird characters with unicode



In [198]:
def replace_encodings(text):
    cases = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">")
        ("^D$e", "€"),
        ("^D#&gt;", "≥"),
        ("^DEL", "…"),
        ("^DIF", "∞"),
        ("^D#=D", "≠"),
        ("^DDS", "-")
        # lots more where this came from...
    ]
    for case in cases:
        text = text.replace(case[0], case[1])   
    pass

  ("&gt;", ">")


## Function: Adding DFK as an Identifier

### DFK as id for Bibframe

We want to add the DFK as a local bf:Identifier to the work (or instance?). 
We also want to say where the Identifier originates (to say it is from PSYNDEX/ZPID). 

The format for that is:
```turtle
<Work/Instance> bf:identifiedBy [
    a bf:Local, pxc:DFK; 
    rdf:value "1234456"; 
    bf:source [
        a bf:Source; bf:code "ZPID.PSYNDEX.DFK"
    ]
];
```

So, we need a blank node for the Identifier and inside, another nested bnode for the bf:Source. This is a function that will return such an identifier bnode to add to the work_uri. We are calling it way up down below in the loop:

In [199]:
#  a function to be called in a for-loop while going through all records of the source xml, 
# which returns a new triple to add to the graph that has a bnode for the dfk identifier.
# The predicate is "bf:identifiedBy" and the object is a blank node of rdf:Type "bf:Identifier" and "bf:Local":
# The actual identifier is a literal with the text from the "DFK" element of the record.
def get_bf_identifier_dfk(instance_uri, dfk):
    # make a  BNODE of the Identifier class from the BF namespace:
    # identifier = BNode()
    identifier = URIRef(instance_uri + "/identifier/dfk")
    identifier_source = BNode()
    # records_bf.add ((identifier, RDF.type, BF.Identifier))
    records_bf.add ((identifier, RDF.type, BF.Local))
    records_bf.add ((identifier, RDF.type, PXC.DFK))
    # build the source node:
    records_bf.add((identifier_source, RDF.type, BF.Source))
    records_bf.add((identifier_source, BF.code, Literal("ZPID.PSYNDEX.DFK")))

    # hang the id source node into the id node:
    records_bf.add((identifier, BF.source, identifier_source))
    records_bf.add((identifier, RDF.value, Literal(dfk)))
    return (identifier)

## Generic Function: Replace languages with their language tag

Can be used for different fields that are converted to langstrings or language uris. Use within other functions that work with the languages in different fields.

Returns an array with two values: a two-letter langstring tag at [0] and a three-letter uri code for the library of congress language vocab at [1].

In [200]:
def get_langtag_from_field(langfield):
    # when passed a string from any language field in star, returns an array with two items. 
    # Index 0: two-letter langstring tag, e.g. "de"
    # Index 1: two-letter iso langtag, e.g. "ger"
    # can be used on these fields (it contains the different spellings found in them):
    # "LA", "LA2", "TIL", "TIUL", "ABLH", "ABLN", "TIUE |s"
    match langfield:
        case "german" | "de" | "GERM" | "Deutsch" | "GERMAN" | "GERMaN" | "German" | "Fi":
            return ["de", "ger"]
        case "en" | "ENGL" | "ENGLISH" | "Englisch" | "English" | "English; English" | "english" :
            return ["en", "eng"]
        case "BULG" | "Bulgarian":
            return ["bg", "bul"]
        case "SPAN"| "Spanish":
            return ["es", "spa"]
        case "Dutch":
            return ["nl", "dut"]
        case "CZEC":
            return ["cs", "ces"]
        case "FREN" | "French":
            return ["fr", "fra"]
        case "ITAL" | "Italian":
            return ["it", "ita"]
        case "PORT" | "Portuguese":
            return ["pt", "por"]
        case "JAPN" | "Japanese":
            return ["jp", "jpn"]
        case "HUNG":
            return ["hu", "hun"]
        case "RUSS" | "Russian":
            return ["ru", "rus"]
        case "NONE" | "Silent":
            return ["zxx", "zxx"]
        case _:
            return ["und", "und"] # for "undetermined!"

# ---
# these are also in those fields, but they are errors that should be repaired before migration!
# X$English 
# EnglishX$
# EnglishX$X$
# $English
 
# $German 
# GermanX$X$
# X$$German
# X$$GermanX$$German
# GermanX$
# GermanX$$EnglishX$$English
# GermanX$$EnglishX$$EnglishX$$English
# GermanX$English 
# GermanX$English X$English 
# GermanX$English X$English X$English 
# GermanX$English X$EnglishX$EnglishX$English
# GermanX$English; English
# GermanX$EnglishX$English
# GermanX$EnglishX$EnglishX$English
# GermanX$EnglishX$EnglishX$EnglishX$EnglishX$English
# Fi (aus TIUL) - während TIL "German" ist! Das Dokument ist auch eindeutig Deutsch.

# 4 (aus LA2) -> kann gelöscht werden, das Dokument ist in Deutsch und hat keine Zweitsprache!
# Q (aus LA2) -> kann gelöscht werden, das Dokument ist in Deutsch und hat keine Zweitsprache! 


## Function: Get work language from LA

Example

```turtle
@prefix lang: <http://id.loc.gov/vocabulary/iso639-2/> .
<W> bf:language lang:ger .
```

Calls the generic language code lookup function above, get_langtag_from_field, passing the LA field content, returning a uri from the library of congress language vocabulary (built from namespace + 3-letter iso code). 

In [201]:
# function 
def get_work_language(record):
    work_language = get_langtag_from_field(record.find("LA").text.strip())[1]
    work_lang_uri = LANG[work_language]
    return (work_lang_uri)

## Function: Create Instance Title nodes from fields TI, TIU, TIL, TIUE...

Titles and Translated titles are attached to Instances. Translated titles also have a source, which can be DeepL, ZPID, or Original.

Example:

```turtle
<Instance> bf:title 
        [a bf:Title; 
            bf:mainTitle "Disentangling the process of epistemic change"@en;
            bf:subtitle "The role of epistemic volition"@en;
        ],
        [a pxc:TranslatedTitle;
            rdfs:label "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:mainTitle "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:adminMetadata  [ 
                a bf:AdminMetadata ;
                bflc:metadataLicensor  "DeepL";
        ]
        ].
```

- [x] add TI as bf:Title via bf:mainTitle
- [x] add subtitle from TIU
- [x] create a concatenated rdfs:label from TI and TIU
- [x] add languages for maintitle and subtitle (from TIL and TIUL)

- [x] add translated title from TIUE as pxc:TranslatedTitle with bf:mainTitle and rdfs:label 
- [x] add languages for translated title (from subfield TIU |s, or if unavailable, decide language based on TIL language: if de -> en and vice versa) 
- [x] find a way to create a source for the translated title (from "(DeepL)" at the end)

In [202]:
#  a function to be called in a for-loop while going through all records of the source xml, 
# which returns a new triple to add to the graph that has a bnode for the dfk identifier.
# The predicate is "bf:identifiedBy" and the object is a blank node of rdf:Type "bf:Identifier" and "bf:Local":
# The actual identifier is a literal with the text from the "DFK" element of the record.
def get_bf_title(instance_uri, record):
    # make a  BNODE for the title:
    # title = BNode()
    title = URIRef(instance_uri + "/title")
    # make it bf:Title class:
    records_bf.add ((title, RDF.type, BF.Title))

    # get the content of th TI field as the main title:
    maintitle = record.find("TI").text.strip()
    # write a full title for the rdfs:label 
    # (update later if subtitle exists to add that)
    fulltitle = maintitle
    # set dafault language for main title:
    maintitle_language = "en"
    subtitle_language = "en"
    # get language of main title - if exists!:
    if record.find("TIL") is not None:
        maintitle_language = get_langtag_from_field(record.find("TIL").text.strip())[0]
        # if maintitle_language_til == "German":
        #     maintitle_language = "de"
        # else: just keep the default set above: "en"
    # get language of subtitle:
    if record.find("TIUL") is not None:
        subtitle_language = get_langtag_from_field(record.find("TIUL").text.strip())[0]
        # subtitle_language_tiul = record.find("TIUL").text.strip()
        # if subtitle_language_tiul == "German":
        #     subtitle_language = "de"
        # else: just keep the default set above: "en"

    # add the content of TI etc via bf:mainTitle:
    records_bf.add((title, BF.mainTitle, Literal(maintitle, lang=maintitle_language)))
    # get content of the TIU field as the subtitle, 
    # _if_ it exists and has text in it:
    if record.find("TIU") is not None and record.find("TIU") != "":
        subtitle = record.find("TIU").text.strip() # remove extraneous spaces
        # concatenate a full title from main- and subtitle, 
        # separated with a : and overwrite fulltitle with that
        fulltitle = fulltitle + ": " + subtitle
        # add the content of TIU to the bf:Title via bf:subtitle:
        records_bf.add((title, BF.subtitle, Literal(subtitle, lang=subtitle_language)))

    # add the concatenated full title to the bf:Title via rdfs:label:
    # (we don't care if the main title's and subtitle's languages don't match - we just set the language of the main title as the full title's language)
    records_bf.add((title, RDFS.label, Literal(fulltitle, lang=maintitle_language)))

    # # hang the id source node into the id node:
    # records_bf.add((identifier, BF.source, identifier_source))
    return (title)

# function for the translated title:
def get_bf_translated_title(instance_uri, record):
    # translated_title = BNode()
    translated_title = URIRef(instance_uri + "/title/translated")
    records_bf.add ((translated_title, RDF.type, PXC.TranslatedTitle))
    fulltitle = record.find("TIUE").text.strip()
    fulltitle_language = "de"
    # find a way to read subfield |s to get the actual language. 
    # it that doesn't exist, use the inverse of TIL!
    # if fulltitle string ends with "|s " followed by some text (use a regex):
    match = re.search(r'^(.*)\s\|s\s(.*)', fulltitle)
    if match:
        fulltitle = match.group(1).strip()
        fulltitle_language = get_langtag_from_field(match.group(2).strip())[0]
    else:
        # get the language in TIUE, if that field exists
        if record.find("TIL") is not None:
            original_title_language_til = get_langtag_from_field(record.find("TIL").text.strip())[0]
            
            # if it is German -> use inverse: "en"
            if original_title_language_til == "de":
                fulltitle_language = "en"
            # else -> keep "de"

    # check if the title contains a "(DeepL)" and cut it into a variable for the source:
    titlesource = "ZPID" # translation source is "ZPID" by default
    # note: we might be able to add source "Original" by finding out 
    # if the source of the secondary abstract is something other than ZPID!
    match_source = re.search(r'^(.*)\((DeepL)\)$', fulltitle)
    if match_source:
        fulltitle = match_source.group(1).strip()
        titlesource = match_source.group(2)

    # build a source node for the translation:
    titlesource_node = BNode ()
    records_bf.add ((titlesource_node, RDF.type, BF.AdminMetadata))
    records_bf.add ((titlesource_node, BFLC.metadataLicensor, Literal(titlesource)))

    # add the title string to the bnode:
    records_bf.add((translated_title, BF.mainTitle, Literal(fulltitle, lang=fulltitle_language)))
    records_bf.add((translated_title, RDFS.label, Literal(fulltitle, lang=fulltitle_language)))
    records_bf.add((translated_title, BF.adminMetadata, titlesource_node))

    return (translated_title)


## Function: Add Abstracts - original abstract (from fields ABH, ABLH, ABSH1, ABSH2) and translated/secondary abstract (from ABN, ABLN, ASN1, ASN2)

- Main Abstract: 
    - abstract text is in field ABH.
    - abstract language is in ABLH ("German" or "English")
    - abstract original source is in ASH1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASH2 ()
- Secondary Abstract 
    - abstract text is in field ABN.
    - abstract language is in ABLN ("German" or "English")
    - abstract original source is in ASN1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASN2 ()

Scheme:

```turtle
<W> bf:summary 
    [ a pxc:Abstract , bf:Summary ;
        rdfs:label  "Background: Loneliness is ..."@en ;
        bf:adminMetadata  [ 
            a bf:AdminMetadata ;
            bflc:metadataLicensor  "Original";
            bf:descriptionModifier "ZPID"
        ]
] .
```

In [203]:
# function to get the original abstract:
def get_bf_abstract(work_uri, record):
    # abstract = BNode()
    abstract = URIRef(work_uri + "/abstract")
    records_bf.add ((abstract, RDF.type, PXC.Abstract))
    # get abstract text from ABH
    abstracttext = record.find("ABH").text.strip()
    # get abstract language from ABLH ("German" or "English")
    abstract_language = "en" # set default
    if record.find("ABLH") is not None:
        abstract_language = get_langtag_from_field(record.find("ABLH").text.strip())[0]

    # add the text to the bnode:
    records_bf.add ((abstract, RDFS.label, Literal(abstracttext, lang=abstract_language)))

    # get abstract original source from ASH1 ("Original" or "ZPID")
    abstract_source = "Original" # default
    # create a blank node for admin metadata:
    abstract_source_node = BNode()
    records_bf.add((abstract_source_node, RDF.type, BF.AdminMetadata))

    if record.find("ASH1") is not None:
        # overwrite default ("Original") with what we find in ASH1:
        abstract_source = record.find("ASH1").text.strip()
    
    # write final source text into source node:
    records_bf.add((abstract_source_node, BFLC.metadataLicensor, Literal(abstract_source)))

    # here is a list of known zpid employee tags, we will use them later to replace these with "ZPID" if found in ASH2:

    # and this is a list of things we want to replace with "Original":
    

    # get optional agent who edited the original abstract from ASH2
    if record.find("ASH2") is not None:
        # note what we find in ABSH2:
        abstract_editor = record.find("ASH2").text.strip()
        # todo: replace known zpid person initials with "ZPID"
        # "Juergen Wiesenhuetter",
        # "Joachim H. Becker","Udo Wolff", "Juergen Beling", 
        # "Joachim H. Mueller", "Angelika Zimmer", "Annelie Wiertz", "Beate Minsel", "Berndt Zuschlag",  "Doris Lecheler", "Elke Bone", "Guenter Krampen", "Hella Lenders", "Jutta Rohlmann", "Juergen Howe", "Manfred Opitz", "Manfred Fischer", "Paul Klein", "Sigrun-Heide Filipp", "Thomas W. Franke", "Ulrike Fischer", "Yrla M. Labouvie", 
        # "K.Si", "L.F.T.", "M.G.", "I.D." , "A.Bi.", "A.G.", "A.C.", "U.R.W", "U", "C.Si", "pe.k", "r", "R.N", "Ve.K.",   

        # if "Author" or "Autor" -> "Original"
        # and what if "DeepL"???
        # or "FIS Bildung", "GESIS Fachinformation für die Sozialwissenschaften, Bonn", "Kriminologische Zentralstelle", 
        # and add it via decription modifier:
        records_bf.add((abstract_source_node, BF.descriptionModifier, Literal(abstract_editor)))


    #add the source node to the abstract node:
    records_bf.add((abstract, BF.adminMetadata, abstract_source_node))
    # and return the completed node:
    return (abstract)

def get_bf_secondary_abstract(work_uri, record):
    # abstract = BNode()
    abstract = URIRef(work_uri + "/abstract/secondary")
    records_bf.add ((abstract, RDF.type, PXC.Abstract))
    records_bf.add ((abstract, RDF.type, PXC.SecondaryAbstract))
    abstracttext = record.find("ABN").text.strip()
    
    abstract_language = "de" # fallback default
    if record.find("ABLN") is not None:
        abstract_language = get_langtag_from_field(record.find("ABLN").text.strip())[0]
    
    records_bf.add ((abstract, RDFS.label, Literal(abstracttext, lang=abstract_language)))
    
    abstract_source_node = BNode()
    records_bf.add((abstract_source_node, RDF.type, BF.AdminMetadata))
    abstract_source = "Original" # fallback default
    if record.find("ASN1") is not None:
        # overwrite default ("Original") with what we find in ASH1:
        abstract_source = record.find("ASN1").text.strip()
    
    records_bf.add((abstract_source_node, BFLC.metadataLicensor, Literal(abstract_source)))

    # get optional agent who edited the original abstract from ASH2
    if record.find("ASN2") is not None:
        # note what we find in ABSN2:
        abstract_editor = record.find("ASN2").text.strip()
        # and add it via decription modifier:
        records_bf.add((abstract_source_node, BF.descriptionModifier, Literal(abstract_editor)))

    #add the source node to the abstract node:
    records_bf.add((abstract, BF.adminMetadata, abstract_source_node))
    # and return the completed node:
    return (abstract)




## TODO: Function to split Table of Content from the Abstract field (ABH)

This usually starts with " - Inhalt: " (for German Abstracts) or " - Contents: " (in English abstracts) and ends at the end of the field.
It can contain a numbered list of chapters or sections as a long string. It can also contain a uri from dnb namespace instead or in addition!

Examples:
- " - Contents: (1) ..."
- " - Inhalt: https://d-nb.info/1256712809/04</ABH>" (URI pattern: "https://d-nb.info/" + "1256712809" 10 digits + "/04")

Example:

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdfs:label "(1) Wünsche, J., Weidmann, R. &amp; Grob, A. (n. d.). Happy in the same way? The link between domain satisfaction and overall life satisfaction in romantic couples. Manuscript submitted for publication. (2) Wünsche, J., Weidmann,...";
] .
```

Or

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdf:value "https://d-nb.info/1002790794/04"^^xsd:anyURI ;
] .
```

In [204]:
def get_bf_toc(work_uri, record):
    # read the abstract in ABH
    contents = ""
    if record.find("ABH") is not None:
        abstracttext = record.find("ABH").text.strip()
        # check via regex if there is a " - Inhalt: " or " - Contents: " in it.
        # if so, split out what comes after. Drop the contents/inhalt part itself.
        match = re.search(r'^(.*)[-–]\s*(?:Contents|Inhalt)\s*:\s*(.*)$', abstracttext)
        if match:
            abstracttext = match.group(1).strip()
            contents = match.group(2).strip()

    # also check if what comes is either a string or a uri following thegiven pattern
    # and export one as a rdfs_label and the other as rdf:value "..."^^xsd:anyUrl (remember to add XSD namespace!)
    # also remember that we should only create a node and attach it to the work
    # if a) ABH exists at all and
    # b) the regex is satisfied.
    # So I guess we must do the whole checking and adding procedure in this function!

    # only return an added triple if the toc exisits, otherwise return nothing:
    if contents:
        return records_bf.add((work_uri, BF.tableOfContents, Literal(contents)))
    else: 
        return None
    # return records_bf.add((work_uri, BF.tableOfContents, Literal("test")))

## Function: Create Person Contribution nodes from Fields AUP, EMID, EMAIL, AUK, PAUP, CS and COU

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:contribution 
    [
        # the Bibframe Contribution includes, as usual, an agent and their role,
        # but is supplemented with an Affiliation (in the context of that work/while it was written),
        # and a position in the author sequence.
        a bf:Contribution, bflc:PrimaryContribution; 
        bf:agent 
        [
            a bf:Person, schema:Person; 
            rdfs:label "Trillitzsch, Tina"; # name when creating work
            schema:givenName "Tina"; schema:familyName "Trillitzsch";
            owl:sameAs <https://w3id.org/zpid/person/tt_0000001>, <https://orcid.org/0000-0001-7239-4844>; # authority uris of person (local, orcid)
            bf:identifiedBy [a bf:Local, pxc:PsychAuthorsID; rdf:value "p01979TTR"; #legacy authority ID
            ];
            bf:identifiedBy [a bf:Identifier, locid:orcid; rdf:value "0000-0001-7239-4844"; # ORCID 
            ];
        ]
        # we use a model inspired by Option C in Osma Suominen'a suggestion for https://github.com/dcmi/dc-srap/issues/3
        # adding the Affiliation into the Contribution, separate from the agent itself, since the affiliation
        # is described in the context of this work, not not as a statement about the person's
        # current affiliation:
        mads:hasAffiliation [
            a mads:Affiliation;
            # Affiliation blank node has info about the affiliation org (including persistent identifiers),
            # the address (country with geonames identifier),
            # and the person's email while affiliated there.
            mads:organization [
                a bf:Organization; 
                rdfs:label "Leibniz Institute of Psychology (ZPID); Digital Research Development Services"; # org name when work was created
                owl:sameAs <https://w3id.org/zpid/org/zpid_0000001>, <https://ror.org/0165gz615>; # authority uris of org (local, ror)
                # internal id and ror id as literal identifiers:
                bf:identifiedBy [a bf:Local, pxc:ZpidCorporateBodyId; rdf:value "0000001"; ];
                bf:identifiedBy [a bf:Identifier; locid:ror; rdf:value "0165gz615"; ];
            ];
            mads:hasAffiliationAddress [a mads:Address;
                mads:country [
                    a mads:Country, bf:Place;
                    rdfs:label "Germany";
                    bf:identifiedBy [a bf:Identifier, locid:geonames; rdf:value "2921044"; ];
                    owl:sameAs <https://w3id.org/zpid/place/country/ger>;
                ]
            ];
            mads:email <mailto:ttr@leibniz-psychology.org>; # correspondence author email
        ];
        bf:role <http://id.loc.gov/vocabulary/relators/aut>;
        pxp:contributionPosition 1; bf:qualifier "first"; # first author in sequence: our own subproperty of bf:qualifier & schema:position (also: middle, last)
    ].
```

Todos:
- [x] create blank node for contribution and add agent of type bf:Person
- [x] add author position (first, middle, last plus order number) to the contribution
- [x] make first author a bflc:PrimaryContribution
- [x] match AUP with PAUP to get person names and ids (normalize first)
- [x] extend AUP-PAUP match with lookup in kerndaten table/ttl to compare schema:alternatename of person with name in AUP (but first before normalization)
- [x] add ORCID to the person's blank node (doesn't add 4 ORCIDs for unknown reason - maybe duplicates?)
- [x] add EMAIL to person's blank node (either to person in EMID or to first author)
- [x] add affiliation from CS field and COU field to first author
- [x] add Affiliation blank node with org name, country to each author that has these subfields in their AUP (|i and |c)
- [x] add role from AUP subfield |f
- [x] add country geonames id using lookup table
- [ ] move mads:email Literal from bf:Contribution to mads:Affiliation
- [ ] later: reconcile affiliations to add org id, org ror id (once we actually have institution authority files)


In [205]:
from calendar import c


def add_bf_contributor_person_role(role):
    # return role_uri
    return URIRef(ROLES + role)

def country_geonames_lookup(country):
    cases = [
        ("Germany", "2921044"),
        ("United States", "6252001"),
        ("United Kingdom", "2635167"),
        ("Austria", "2782113"),
        ("Switzerland", "2658434"),
        ("Netherlands", "2750405"),
        ("Belgium", "2802361"),
        ("France", "3017382"),
        ("Italy", "3175395"),
        ("Spain", "2510769"),
        ("Japan", "1861060"),
        ("Bulgaria", "732800"),
        ("Hungary", "719819"),
        ("Czech Republic", "3077311"),
        ("Portugal", "2264397"),
        ("Russia", "2017370"),
        ("Poland", "798544"),
        ("Greece", "390903"),
        ("Sweden", "2661886"),
        ("Denmark", "2623032"),
        ("Luxembourg", "2960313"),
        ("Taiwan", "1668284"),
        ("Norway", "3144096"),
        ("Finland", "660013"),
        ("Ireland", "2963597"),
        ("Canada", "6251999"),
        ("Australia", "2077456"),
        ("New Zealand", "2186224"),
        ("South Africa", "953987"),
        ("People's Republic of China", "1814991"),
        ("Turkey", "298795"),
        ("Brazil", "3469034"),
        ("Cuba", "3562981"),
        ("Georgia", "614540"),
        ("Iran", "130758"),
        ("India", "1269750"),
    ]
    for case in cases:
        if case[0].casefold() == str(country).casefold():
            return case[1]
    return None

def normalize_names(familyname,givenname):
    familyname_normalized = familyname.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("Ä", "Ae").replace("Ö", "Oe").replace("Ü", "Ue").replace("ß", "ss")
    # generate an abbreviated version of givenname (only the first letter), but 
    if givenname:
        givenname_abbreviated = givenname[0] + "."
        # generate a normalized version of the name by concatenating the two with a comma as the separator:
        fullname_normalized = familyname_normalized + ", " + givenname_abbreviated
    return fullname_normalized

def match_paup(record, person_node, personname_normalized):
    # loop through all PAUPs and check if the name matches the normalized personname
    for paup in record.findall("PAUP"):
        # given a string such as "Forkmann, Thomas |n p06946TF |u https://www.psychauthors.de/psychauthors/index.php?wahl=forschung&amp;#38;uwahl=psychauthors&amp;#38;uuwahl=p06946TF"
        # split into family name, given name and paId where "Forkmann" is the family name, "Thomas" is the given name and "p06946TF" is the paId:
        paup_split = paup.text.strip().split("|n")[0].strip().split(",")
        if len(paup_split) > 1:
            paup_familyname = paup_split[0].strip()
            paup_givenname = paup_split[1].strip()
            paId = paup.text.strip().split("|n")[1].strip().split("|")[0].strip()
            
            # generate a normalized version of paup_familyname:
            paup_name_normalized = normalize_names(paup_familyname,paup_givenname)
            # generate a uri for the person from the paId that can match the one in kerndaten.ttl generated from psychauthors database:
            person_uri = URIRef("https://w3id.org/zpid/person/" + paId)
            
            # now check if the normalized name from PAUP matches the normalized person name from AUP:
            # if they match and there is a matching person in the kerndaten.ttl graph, add the person uri as schema:sameAs and the current preferred name from psychauthors as schema:preferredName, then return the paId:
            if paup_name_normalized == personname_normalized and (person_uri, RDF.type, SCHEMA.Person) in kerndaten:
                # for debugging, print the actual name in the matching PAUP:
                #records_bf.add((person_node, PXP.paupName, Literal(paup_familyname + ", " + paup_givenname)))
                records_bf.add((person_node, SCHEMA.sameAs, person_uri))
                # add the preferred name from kerndaten as schema:preferredName:
                records_bf.add((person_node, SCHEMA.preferredName, kerndaten.value(person_uri, SCHEMA.name)))
                # return the psychauthors ID:
                return paId
            # but if PAUP and AUP names are no match, even normalized,
            # go through all the alternate names in kerndaten for that Psychauthors ID and check if they match the normalized person name from AUP (this will even find completely changed names, from maiden name to married name etc.):
            elif paup_name_normalized != personname_normalized and (person_uri, RDF.type, SCHEMA.Person) in kerndaten:
                for alternatename in kerndaten.objects(person_uri, SCHEMA.alternateName):
                    # split alternatename into first and last name:
                    alternatename_split = alternatename.split(",")
                    if len(alternatename_split) > 1:
                        alternatename_familyname = alternatename_split[0].strip()
                        alternatename_givenname = alternatename_split[1].strip()
                        # generate a normalized version of alternatename_familyname to compare with PAUP name later:
                        alternatename_normalized = normalize_names(alternatename_familyname,alternatename_givenname)
                        if personname_normalized == alternatename_normalized:
                            # we have found another match!
                            # add the uri as schema:sameAs and put the current preferred name from psychauthors here, too (for debugging purposes):
                            records_bf.add((person_node, SCHEMA.sameAs, person_uri))
                            records_bf.add((person_node, SCHEMA.preferredName, kerndaten.value(person_uri, SCHEMA.name)))
                            return paId
            else:
                return None

def get_orcid(record, person_node, personname):
    # loop through all ORCIDs and check if the name matches the personname
    for orcid in record.findall("ORCID"):
        # go through all ORCID fields and check for matches of personname with the text before "|u": 
        # split the orcid string into the orcid id and the name:
        orcid_split = orcid.text.strip().split("|u")
        
        # if there is a name part, compare it to the personname:
        if len(orcid_split) > 1:
            orcid_name = orcid_split[0].strip()
            orcidId = orcid_split[1].strip()
            # clean up the orcid_id by removing spaces that sometimes sneak in when entering them in the database:
            orcidId = orcidId.replace(" ", "")
            
            # by the way, here is a regex pattern for valid orcids:
            # orcid_pattern = re.compile(r'^\d{4}-\d{4}-\d{4}-\d{3}[0-9X]$')
            # and a way, check if the orcid id matches the pattern:
            # if not orcid_pattern.match(orcidId):
            #     print("invalid orcid: " + orcidId)

            # if the name matches, return the orcid id for adding it to the person node:
            if orcid_name == personname:
                return orcidId
            else:
                return None

def build_affiliation_nodes(person_affiliation, person_affiliation_country):
    # is passed two string: the affiliation name and the affiliation country name
    # make a blank node for the affiliation and make it class mads:Affiliation:
    person_affiliation_node = BNode()
    records_bf.add((person_affiliation_node, RDF.type, MADS.Affiliation))
    # make a blank node for the affiliation organization and make it class bf:Organization:
    person_affiliation_org_node = BNode()
    records_bf.add((person_affiliation_org_node, RDF.type, BF.Organization))
    # add the affiliation organization node to the affiliation node:
    records_bf.add((person_affiliation_node, MADS.organization, person_affiliation_org_node))
    # add the affiliation string to the affiliation org node:
    records_bf.add((person_affiliation_org_node, RDFS.label, Literal(person_affiliation)))

    # make a blank node for the affiliation address and make it class mads:Address:
    person_affiliation_address_node = BNode()
    records_bf.add((person_affiliation_address_node, RDF.type, MADS.Address))
    # add a country node to the affiliation address node:
    person_affiliation_country_node = BNode()
    records_bf.add((person_affiliation_country_node, RDF.type, MADS.Country))
    # add the country node to the affiliation address node:
    records_bf.add((person_affiliation_address_node, MADS.country, person_affiliation_country_node))
    # add the affiliation address string to the affiliation address node:
    records_bf.add((person_affiliation_country_node, RDFS.label, Literal(person_affiliation_country)))

    # if the country is in the geonames lookup table, add the geonames uri as sameAs and the geonames id as an identifier:
    if country_geonames_lookup(person_affiliation_country):
        geonamesId = country_geonames_lookup(person_affiliation_country)
        geonames_uri = URIRef("http://geonames.org/" + geonamesId + "/")
        records_bf.add((person_affiliation_country_node, SCHEMA.sameAs, geonames_uri))
        # add the geonames identifier:
        person_affiliation_country_identifier_node = BNode()
        records_bf.add((person_affiliation_country_identifier_node, RDF.type, BF.Identifier))
        records_bf.add((person_affiliation_country_identifier_node, RDF.type, LOCID.geonames))
        records_bf.add((person_affiliation_country_identifier_node, RDF.value, Literal(geonamesId)))
        records_bf.add((person_affiliation_country_node, BF.identifier, person_affiliation_country_identifier_node))
    # add the affiliation address node to the affiliation node:
    records_bf.add((person_affiliation_node, MADS.hasAffiliationAddress, person_affiliation_address_node))

    # return the finished affiliation node with all its children and attached strings:
    return person_affiliation_node

# the full function that creates a contribution node for each person in AUP:
# first, get all AUPs in a record and create a blank node for each of them
def add_bf_contributor_person(work_uri, record):
    # initialize a counter for the contribution position and a variable for the contribution qualifier:
    contribution_counter = 0
    contribution_qualifier = None
    
    for person in record.findall("AUP"):
        # count how often we've gone through the loop to see the author position:
        contribution_counter += 1
        # make a blank node for the bf:Contribution:
        # contribution_node = BNode()
        contribution_node = URIRef(work_uri + "/contribution/" + str(contribution_counter))
        records_bf.add((contribution_node, RDF.type, BF.Contribution))
        
        # make a blank node for the person:
        person_node = BNode()
        records_bf.add((person_node, RDF.type, BF.Person))

        

        # add the counter as an author position to the contribution node:
        records_bf.add((contribution_node, PXP.contributionPosition, Literal(contribution_counter)))

        # if we are in the first loop, set "contrution_qualifier" to "first":
        if contribution_counter == 1:
            contribution_qualifier = "first"
            records_bf.add((contribution_node, RDF.type, BFLC.PrimaryContribution))
        # if we are in the last loop, set "contribution_qualifier" to "last":
        elif contribution_counter == len(record.findall("AUP")):
            contribution_qualifier = "last"
        # if we are in any other loop but the first or last, set "contribution_qualifier" to "middle":
        else:
            contribution_qualifier = "middle"

        # add the contribution qualifier to the contribution node:
        records_bf.add((contribution_node, BF.qualifier, Literal(contribution_qualifier)))

        # add the name from AUP to the person node, but only use the text before the first |:
        personname = person.text.strip().split("|")[0].strip()
        records_bf.add((person_node, RDFS.label, Literal(personname)))        

        # initialize variables for later use:
        personname_normalized = None
        orcidId = None

        # split personname into first and last name:
        personname_split = personname.split(",")
        if len(personname_split) > 1:
            familyname = personname_split[0].strip()
            givenname = personname_split[1].strip()
            records_bf.add((person_node, SCHEMA.familyName, Literal(familyname)))
            records_bf.add((person_node, SCHEMA.givenName, Literal(givenname)))
            # generate a normalized version of familyname to compare with PAUP name later:
            personname_normalized = normalize_names(familyname,givenname)
            # for debugging, print the normalized name:
            # records_bf.add((person_node, PXP.normalizedName, Literal(personname_normalized)))

        # call the function match_paup to match the personname from AUP with the PAUPs:
        paId = match_paup(record, person_node, personname_normalized)
        if paId is not None:
            # create a blank node for the identifier:
            # we coulkd do this into the function, but then I will have to return something else
            psychauthors_identifier_node = BNode()
            records_bf.add((psychauthors_identifier_node, RDF.type, BF.Identifier))
            records_bf.add((psychauthors_identifier_node, RDF.type, BF.Local))
            records_bf.add((psychauthors_identifier_node, RDF.type, PXC.PsychAuthorsID))
            records_bf.add((psychauthors_identifier_node, RDF.value, Literal(paId)))
            # add the identifier node to the person node:
            records_bf.add((person_node, BF.identifiedBy, psychauthors_identifier_node))
            # create a urL from the paid and add it as a "webpage describing this entity" to the person node:
            psychauthors_url = "https://www.psychauthors.de/psychauthors/index.php?wahl=forschung&uwahl=psychauthors&uuwahl=" + paId
            records_bf.add((person_node, SCHEMA.mainEntityOfPage, URIRef(psychauthors_url)))
        
        # call the function get_orcid to match the personname with the ORCIDs in the record:
        orcidId = get_orcid(record, person_node, personname)
        if orcidId is not None:
            # create a blank node for the identifier:
            orcid_identifier_node = BNode()
            records_bf.add((orcid_identifier_node, RDF.type, BF.Identifier))
            records_bf.add((orcid_identifier_node, RDF.type, LOCID.orcid))
            records_bf.add((orcid_identifier_node, RDF.value, Literal(orcidId)))
            # add the identifier node to the person node:
            records_bf.add((person_node, BF.identifiedBy, orcid_identifier_node))
            # add the orcid id as a sameAs link to the person node:
            orcid_uri = "https://orcid.org/" + orcidId
            records_bf.add((person_node, SCHEMA.sameAs, URIRef(orcid_uri)))

        ## ----- 
        # Getting Affiliations and their countries from first, CS and COU (only for first author), and then from subfields |i and |c in AUP (for newer records)
        ## -----
        
        # initialize variables we'll need for adding affiliations and country names from AUP |i and CS/COU/ADR:
        affiliationstring = None
        affiliation_country = None
      
        # match affiliations in CS and COU to first contribution/author:
        # dont add ADR here yet (even if this is the place for it - we may drop that info anyway.
        # look for the field CS:
        # if the contribution_counter is 1 (i.e. if this is the first loop/first author), add the affiliation to the person node:
        if contribution_counter == 1:
            if record.find("CS") is not None:
                # get the content of the CS field:
                affiliationstring = record.find("CS").text.strip()

            if record.find("COU") is not None:
                # get the country from the COU field:
                affiliation_country = record.find("COU").text.strip()

                
        ## Get affiliation from AUP |i, country from |c:
        # no looping necessary here, just check if a string |i exists in AUP and if so, add it to the person node:
        # if AUP contains "|i ", use anything after it and before the end of the string or before another "|" as the affiliation string:
        if person.text.strip().find("|i ") > -1:
            # save that text in a variable:
            affiliationstring = person.text.strip().split("|i")[1].strip().split("|")[0].strip()
            
        # now check if there is a country in |c:
        if person.text.strip().find("|c ") > -1:
            # save that text in a variable:
            affiliation_country = person.text.strip().split("|c")[1].strip().split("|")[0].strip()

        # pass this to function build_affiliation_nodes to get a finished affiliation node:
        if affiliationstring is not None:
            affiliation_node = build_affiliation_nodes(affiliationstring, affiliation_country)
            # add the affiliation node to the contribution node:
            records_bf.add((contribution_node, MADS.hasAffiliation, affiliation_node))

        # look for the field EMAIL:
        email = None
        # TODO: the email address actually belongs into the affiliation section, but we'll leave it directly in the contribution node for now:
        if record.find("EMAIL") is not None:
            # get the email address from the EMAIL field, replacing spaces with underscores (common problem in urls in star) and adding a "mailto:" prefix:
            email = "mailto:" + record.find("EMAIL").text.strip().replace(" ", "_")
            # email = "mailto:" + record.find("EMAIL").text.strip()
            # if there is no EMID and the contribution_counter is 1 (i.e. if this is the first loop), add the email to the person node:
            if record.find("EMID") is None and contribution_counter == 1:
                records_bf.add((contribution_node, MADS.email, URIRef(email)))
            # else match the existing EMID field to the personname:
            elif record.find("EMID") is not None and record.find("EMID").text.strip() == personname:
                records_bf.add((contribution_node, MADS.email, URIRef(email))) 
                
            
        role = None
        # check if there is a role in the AUP field:
        if person.text.strip().find("|f ") > -1:
            # save that text in a variable:
            role = person.text.strip().split("|f")[1].strip().split("|")[0].strip()
            # add it to the contribution node:
            records_bf.add((contribution_node, BF.role, add_bf_contributor_person_role(role)))
        # if there isn't, the role is "AU" by default:
        else:
            records_bf.add((contribution_node, BF.role, add_bf_contributor_person_role("AU")))

        ## --- Add the contribution node to the work node:
        records_bf.add((work_uri, BF.contribution, contribution_node))
        # add the person node to the contribution node as a contributor:
        records_bf.add((contribution_node, BF.agent, person_node))    

## TODO: Function: Create Topics, Weighted Topics and Classifications from CT, SH

Maybe try lookup with Skosmos?

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:subject [a bf:Topic, pxc:WeightedTopic, skos:Concept; # # topic, weighted
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/35365>;
        rdfs:label "Ontologies"@en, "Ontologien"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    bf:subject [a bf:Topic, skos:Concept; # a non-weighted topic
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/60135>;
        rdfs:label "Semantic Networks"@en, "Semantische Netzwerke"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    # PSYNDEX subject heading classification
    bf:classification [ a bf:Classification, pxc:SubjectHeading, skos:Concept;
        rdfs:label "Professional Psychological & Health Personnel Issues"@en;
        bf:code "3400";
        owl:sameAs <https://w3id.org/zpid/vocabs/class/3400>;
        bf:source <https://w3id.org/zpid/vocabs/class>;
    ].
```

## TODO: Function: Create nodes for Population Age Group (AGE) and Population Location (PLOC)

Use this scheme:

```turtle
<Work> 
# age group study is about/sample was from:
    bflc:demographicGroup [a bflc:DemographicGroup, pxc:AgeGroup, skos:Concept;
        rdfs:label "Adulthood"@en, "Erwachsenenalter"@de;
        owl:sameAs <https://w3id.org/zpid/vocabs/age/adulthood>;
        bf:source <https://w3id.org/zpid/vocabs/age/AgeGroups>; 
    ];
    # population location: 
    bf:geographicCoverage [a bf:GeographicCoverage, pxc:PopulationLocation, skos:Concept;
        rdfs:label "Germany"@en;
        owl:sameAs <countries/ger>;
    ].
```

## Function: Create nodes for PRREG (linked Preregistration Works)

Field PRREG can occur multiple times per record (0..n). 
It contains a link and/or DOI to a preregistration document. 

Possible subfields:
- |u URL linking to the document
- |d DOI for the document
- |i additional Info text

There are many errors we could catch here. 
- [x] Most importantly, we can replace any " " with "_" in the |u.
- [x] Also, |d should contain pure DOIs with prefixes, so they should start with "10." If they don't, remove any prefixes to make a "pure" DOI.
- [x] remove or ignore any empty subfields that may exist (|u, |d, |i)

Example:

```turtle
<https://w3id.org/zpid/pub/work/0003> a bf:Work; 
    bflc:relationship 
    [
        a bflc:Relationship;
        bflc:relation relations:hasPreregistration;
        bf:note [a bf:Note; rdfs:label "Australian Sample"];
        bf:supplement # may change, not sure?
        [
            a bf:Work, bf:Text; 
            bf:genreForm genres:preregistration; 
            bf:content content:text;
            bf:hasInstance 
            [
                a bf:Instance;
                bf:electronicLocator <https://osf.io/prereg1>;
                bf:identifier [a bf:Identifier, bf:Doi; rdf:value "10.123code003"];
                # add bf:media "computer" from rda media types
                bf:media <http://rdvocab.info/termList/RDAMediaType/1003>;
                # bf:carrier "online resource" from rda vocabulary
                bf:carrier <http://rdvocab.info/termList/RDACarrierType/1018>;
            ]
        ] 
    ]
.
```

In [206]:
# function to build the nodes for preregistration links
def get_bf_preregistrations(work_uri, record):
    # get the preregistration link from the field PREREG:
    preregistration_link = ""
    for prreg in record.findall("PRREG"):
        # build a blank node for bflc:Relationship:
        preregistration_node = BNode()
        records_bf.add((preregistration_node, RDF.type, BFLC.Relationship))
        # add a blank node for the Work we will link:
        preregistration_work_node = BNode()
        records_bf.add((preregistration_work_node, RDF.type, BF.Work))
        # make the work a bf:Text, because preregistrations always are:
        records_bf.add((preregistration_work_node, RDF.type, BF.Text))
        # add a bf:content of <https://w3id.org/zpid/vocabs/contenttypes/> to the work:
        records_bf.add((preregistration_work_node, BF.content, URIRef("https://w3id.org/zpid/vocabs/contenttypes/text")))
        # add a bf:genreForm of <https://w3id.org/zpid/vocabs/genres/preregistration>:
        records_bf.add((preregistration_work_node, BF.genreForm, URIRef("https://w3id.org/zpid/vocabs/genres/preregistration")))
        # add an instance to hang bf:electronicLocator and bf:Doi on:
        preregistration_instance_node = BNode()
        records_bf.add((preregistration_instance_node, RDF.type, BF.Instance))
        # add a bf:media of <http://rdvocab.info/termList/RDAMediaType/1003> (computer) to the instance, because preregistrations are always online (they have a url or doi, after all):
        records_bf.add((preregistration_instance_node, BF.media, URIRef("http://rdvocab.info/termList/RDAMediaType/1003")))
        # and also a bf:carrier <http://rdvocab.info/termList/RDACarrierType/1018> (online resource):
        records_bf.add((preregistration_instance_node, BF.carrier, URIRef("http://rdvocab.info/termList/RDACarrierType/1018"))) 
        # add the instance to the work:
        records_bf.add((preregistration_work_node, BF.hasInstance, preregistration_instance_node))
        # add the work to the relationship:
        records_bf.add((preregistration_node, BF.supplement, preregistration_work_node))
        # add a bflc:relation of <<https://w3id.org/zpid/vocabs/relations/hasPreregistration> to the relationship:
        records_bf.add((preregistration_node, BFLC.relation, URIRef("https://w3id.org/zpid/vocabs/relations/hasPreregistration")))
        # get a preregistration link from the PREREG subfield |u
        # but only if a |u exists:
        if prreg.text.strip().find("|u") > -1:
            preregistration_link = prreg.text.strip().split("|u")[1].strip().split("|")[0].strip()
            # replace any spaces in the link with underscores:
            preregistration_link = preregistration_link.replace(" ", "_")
            # add a bf:electronicLocator to the instance:
            records_bf.add((preregistration_instance_node, BF.electronicLocator, URIRef(preregistration_link)))

        # get a DOI from the PREREG subfield |d, if it exists:
        if prreg.text.find("|d") > -1:
            preregistration_doi = prreg.text.strip().split("|d")[1].strip().split("|")[0].strip()
            # if this string is not empty:
            if preregistration_doi !="":
                # check if it is even a DOI at all (i.e. starts with "10."):
                # make a regex pattern for dois that says they must start with "10.":
                doi_pattern = re.compile(r'^10\..*')
                # if the string matches the pattern, keep it:
                if doi_pattern.match(preregistration_doi):
                    pass
                # if it has anything before the "10.", remove thet part that comes before the "10.", but do keep "10." any anything after it:
                else:
                    preregistration_doi = re.sub(r'^.*10\.', '10.', preregistration_doi)

                # add a bf:Doi to the instance using bf:identidiedBy and a blank node for the identifier (bf:Doi):
                preregistration_doi_node = BNode()
                records_bf.add((preregistration_doi_node, RDF.type, BF.Identifier))
                records_bf.add((preregistration_doi_node, RDF.type, BF.Doi))
                records_bf.add((preregistration_doi_node, RDF.value, Literal(preregistration_doi)))
                records_bf.add((preregistration_instance_node, BF.identifiedBy, preregistration_doi_node))
                # also generate a doi url and add it as a schema:sameAs link:
                preregistration_doi_url = "https://doi.org/" + preregistration_doi
                records_bf.add((preregistration_instance_node, SCHEMA.sameAs, URIRef(preregistration_doi_url)))
            
            # get note from the PREREG subfield |i, if it exists
            if prreg.text.find("|i") > -1:
                preregistration_note = prreg.text.strip().split("|i")[1].strip().split("|")[0].strip()
                if preregistration_note !="":
                    # create a bf:Note node:
                    preregistration_note_node = BNode()
                    records_bf.add((preregistration_note_node, RDF.type, BF.Note))
                    # add the note to the note node via rdfs:label:
                    records_bf.add((preregistration_note_node, RDFS.label, Literal(preregistration_note)))
                    # add the note to the relationship node via bf:note:
                    records_bf.add((preregistration_node, BF.note, preregistration_note_node))


        # add preregistration_node to work:
        records_bf.add((work_uri, BFLC.relationship, preregistration_node))

# The Loop!
## Creating the Work and Instance uris and adding other triples via functions

### Uris and types for Bibframe profile

We want two URIs, since we split the Records into (at first) one work and one instance, which will be linked together.
We also say one will be a (rdf:type) bf:Work and the other bf:Instance.
Then we print all these triples into a file for the bibframe profile.

In [207]:
# print(len(root.findall("Record")))


for record in root.findall("Record"):

    # get the DFK identifier from the record:
    dfk = record.find("DFK").text

    # create a URI for the work and the instance and give them their correct bf classes:
    work_uri = WORKS[dfk]
    records_bf.add((work_uri, RDF.type, BF.Work))
    instance_uri = INSTANCES[dfk]
    records_bf.add((instance_uri, RDF.type, BF.Instance))

    # connect work and instance via bf:instanceOf and bf:hasInstance:
    records_bf.add((instance_uri, BF.instanceOf, work_uri))
    records_bf.add((work_uri, BF.hasInstance, instance_uri))

    # add an identifier bnode to the work using a function:
    records_bf.add((instance_uri, BF.identifiedBy, get_bf_identifier_dfk(instance_uri, dfk)))
 

    # get field TI and add as title node:
    records_bf.add((instance_uri, BF.title, get_bf_title(instance_uri, record)))

    # get work language from LA
    records_bf.add((work_uri, BF.language, get_work_language(record)))

    # get TIUE field and add as translated title node:
    # but only if the field exists!
    if record.find("TIUE") is not None and record.find("TIUE").text != "":
        records_bf.add((instance_uri, BF.title, get_bf_translated_title(instance_uri, record)))


    # get and add contributors:
    # records_bf.add((work_uri, BF.contribution, add_bf_contributor_person(record)))
    add_bf_contributor_person(work_uri, record)
    # get toc, if it exists:
    get_bf_toc(work_uri, record)
    
    # get and add main/original abstract:
    # note: somehow not all records have one!
    if record.find("ABH") is not None:
        records_bf.add((work_uri, BF.summary, get_bf_abstract(work_uri, record)))

    # get and add main/original abstract:
    # note: somehow not all records have one!
    if record.find("ABN") is not None:
        records_bf.add((work_uri, BF.summary, get_bf_secondary_abstract(work_uri, record)))

    # get and add preregistration links:
    get_bf_preregistrations(work_uri, record)


# print all the resulting triples:
records_bf.serialize("ttl-data/bibframe_records.ttl", format="turtle")
records_bf.serialize("ttl-data/bibframe_records.jsonld", format="json-ld")
print(len(records_bf), "triples")

93440 triples


### Uris and types for simplified profile (schema-org)

For the simplified profile, we only need one entity per record (for now) and we give it the class schema:CreativeWork.
Then we print the resulting triples into a separate file for the simplified profile that mostly uses schema.org properties and classes.

In [208]:


# print(len(root.findall("Record")))

# for record in root.findall("Record"):
#     # get the DFK identifier from the record:
#     dfk = record.find("DFK").text

#     # create a URI for the work by attaching the dfk to the works namespace and 
#     # then give it the correct schema.org class:
#     work_uri = WORKS[dfk]
#     records_schema.add((work_uri, RDF.type, SCHEMA.CreativeWork))

#     # get work language from LA
#     records_schema.add((work_uri, SCHEMA.inLanguage, get_work_language(record)))


# records_schema.serialize("ttl-data/schema_records.jsonld", format="json-ld")
# # records_schema.serialize("ttl-data/schema_records.ttl", format="turtle")
# print(len(records_schema), "triples")