# Star2BF - Star to Bibframe

Import libraries:

In [12]:
from rdflib import Graph, Literal
from rdflib.namespace import RDF, RDFS, Namespace
# from rdflib.namespace import SCHEMA, XSD
from rdflib import BNode
import xml.etree.ElementTree as ET
import re

Create an "element tree" from the records in my xml file so we can loop through them and do things with them:

In [13]:
# NOTE: despite its name, `root` is the ElementTree wrapper returned by ET.parse,
# not the root element itself; use root.getroot() to reach the top-level element
# (as the debugging snippet below does).
root = ET.parse("xml-data/records-440.xml")

# To see the source xml's structure, uncomment this function:
# def print_element(element, depth=0):
#     print("\t"*depth, element.tag, element.attrib, element.text)
#     for child in element:
#         print_element(child, depth+1)

# Prints only the first two children of the root element:
# for child in root.getroot()[:2]:
#     print_element(child)


We first set a few namespace objects for bibframe, schema.org and for our resources (the works and instances) 
themselves.

Then, we create two graphs from the xml source file, one to generate triples for our bibframe profile output, and the other for the simplified schema.org profile. 

Finally, we bind the prefixes with their appropriate namespaces to the graphs.

In [14]:
# External vocabularies we take classes and properties from:
BF = Namespace("http://id.loc.gov/ontologies/bibframe/")
BFLC = Namespace("http://id.loc.gov/ontologies/bflc/")
SCHEMA = Namespace("http://schema.org/")
# Library of Congress language vocabulary (ISO 639-2 three-letter codes):
LANG = Namespace("http://id.loc.gov/vocabulary/iso639-2/")

# Namespaces for the resources we mint ourselves (works and instances)
# and for our own ontology (classes and properties):
WORKS = Namespace("https://w3id.org/zpid/resources/works/")
INSTANCES = Namespace("https://w3id.org/zpid/resources/instances/")
PXC = Namespace("https://w3id.org/zpid/ontology/classes/")
PXP = Namespace("https://w3id.org/zpid/ontology/properties/")

# graph for bibframe profile:
records_bf = Graph()

# we need a new graph for the schema.org profile, so it won't just reuse the old triples from the other profile
records_schema = Graph()

# Bind the namespaces to the prefixes we want to see in the bibframe output:
records_bf.bind("bf", BF)
records_bf.bind("bflc", BFLC)
records_bf.bind("works", WORKS)
records_bf.bind("instances", INSTANCES)
records_bf.bind("pxc", PXC)
records_bf.bind("pxp", PXP)
records_bf.bind("lang", LANG)

# ... and in the schema.org output:
records_schema.bind("works", WORKS)
records_schema.bind("instances", INSTANCES)
# Recent rdflib versions pre-bind the prefix "schema" to https://schema.org/
# (https, not http). Without replace=True that existing binding wins and our
# http://schema.org/ namespace gets serialized under the generated prefix
# "schema1". replace=True lets our namespace take over the "schema" prefix.
records_schema.bind("schema", SCHEMA, override=True, replace=True)


# Functions to do all the things

We need functions for the different things we will do - to avoid one long monolith of a loop.

This is where they will go. Examples: Create blank nodes for Identifiers, create nested contribution objects from disparate person entries in AUP, AUK, CS and COU fields, merge PAUP (psychauthor person names and ids) with the person's name in AUP...

These functions will later be called at the bottom of this notebook, in a loop over all the xml records.

## Function: Adding DFK as an Identifier

### DFK as id for Bibframe

We want to add the DFK as a local bf:Identifier to the work (or instance?). 
We also want to say where the Identifier originates (to say it is from PSYNDEX/ZPID). 

The format for that is:
```turtle
<Work/Instance> bf:identifiedBy [
    a bf:Local, pxc:DFK; 
    rdf:value "1234456"; 
    bf:source [
        a bf:Source; bf:code "ZPID.PSYNDEX.DFK"
    ]
];
```

So, we need a blank node for the Identifier and inside, another nested bnode for the bf:Source. This is a function that will return such an identifier bnode to add to the work_uri. It is called further down, in the loop over all records:

In [15]:
# Called once per record from the main loop. Builds the nested blank nodes for
# the DFK identifier inside records_bf; the caller links the returned node to
# the work via bf:identifiedBy.
def get_bf_identifier_dfk(dfk):
    """Create the DFK identifier blank node in ``records_bf`` and return it.

    The node is typed bf:Local and pxc:DFK (bf:Identifier is not asserted),
    carries the DFK as rdf:value, and points via bf:source to a second blank
    node of type bf:Source with bf:code "ZPID.PSYNDEX.DFK".

    dfk: the identifier text taken from the record's DFK element.
    """
    dfk_node = BNode()
    source_node = BNode()
    add_triple = records_bf.add

    # classes of the identifier node:
    add_triple((dfk_node, RDF.type, BF.Local))
    add_triple((dfk_node, RDF.type, PXC.DFK))

    # the nested source node says where the identifier comes from:
    add_triple((source_node, RDF.type, BF.Source))
    add_triple((source_node, BF.code, Literal("ZPID.PSYNDEX.DFK")))

    # connect source and value to the identifier node:
    add_triple((dfk_node, BF.source, source_node))
    add_triple((dfk_node, RDF.value, Literal(dfk)))
    return dfk_node

## Generic Function: Replace languages with their language tag

Can be used for different fields that are converted to langstrings or language uris. Use within other functions that work with the languages in different fields.

Returns an array with two values: a two-letter langstring tag at [0] and a three-letter uri code for the library of congress language vocab at [1].

In [16]:
# Each entry: (two-letter langstring tag, three-letter ISO 639-2 code) followed
# by every spelling of that language found in the STAR fields
# "LA", "LA2", "TIL", "TIUL", "ABLH", "ABLN" and "TIUE |s".
# The three-letter codes are the *bibliographic* ISO 639-2 codes ("ger", "fre",
# "cze", "dut"), because those are the ones used in the Library of Congress
# vocabulary that the LANG namespace points to.
_LANGUAGE_SPELLINGS = (
    # "Fi" is a known data error in TIUL of a German document.
    (("de", "ger"), ("german", "de", "GERM", "Deutsch", "GERMAN", "GERMaN", "German", "Fi")),
    (("en", "eng"), ("en", "ENGL", "ENGLISH", "Englisch", "English", "English; English", "english")),
    (("bg", "bul"), ("BULG", "Bulgarian")),
    (("es", "spa"), ("SPAN", "Spanish")),
    (("nl", "dut"), ("Dutch",)),
    (("cs", "cze"), ("CZEC",)),
    (("fr", "fre"), ("FREN", "French")),
    (("it", "ita"), ("ITAL", "Italian")),
    (("pt", "por"), ("PORT", "Portuguese")),
    # the language tag for Japanese is "ja" ("jp" is the country code)
    (("ja", "jpn"), ("JAPN", "Japanese")),
    (("hu", "hun"), ("HUNG",)),
    (("ru", "rus"), ("RUSS", "Russian")),
    # "zxx" = no linguistic content
    (("zxx", "zxx"), ("NONE", "Silent")),
)

# Flat lookup table: spelling -> (langstring tag, ISO 639-2 code)
_LANGUAGE_CODES = {
    spelling: codes
    for codes, spellings in _LANGUAGE_SPELLINGS
    for spelling in spellings
}


def get_langtag_from_field(langfield):
    """Translate a language name from a STAR field into language codes.

    langfield: the (already stripped) content of a language field. Matching is
        exact and case-sensitive; only the spellings listed in the table above
        are recognized.

    Returns a new two-item list:
        index 0: langstring tag for literals, e.g. "de"
        index 1: three-letter ISO 639-2 code for building a language uri, e.g. "ger"
    Anything unrecognized (including non-string input) yields
    ["und", "und"] ("undetermined").
    """
    if not isinstance(langfield, str):
        return ["und", "und"]
    tag, iso_code = _LANGUAGE_CODES.get(langfield, ("und", "und"))
    # return a fresh list so callers can never modify the shared table
    return [tag, iso_code]

# ---
# these are also in those fields, but they are errors that should be repaired before migration!
# X$English 
# EnglishX$
# EnglishX$X$
# $English
 
# $German 
# GermanX$X$
# X$$German
# X$$GermanX$$German
# GermanX$
# GermanX$$EnglishX$$English
# GermanX$$EnglishX$$EnglishX$$English
# GermanX$English 
# GermanX$English X$English 
# GermanX$English X$English X$English 
# GermanX$English X$EnglishX$EnglishX$English
# GermanX$English; English
# GermanX$EnglishX$English
# GermanX$EnglishX$EnglishX$English
# GermanX$EnglishX$EnglishX$EnglishX$EnglishX$English
# Fi (from TIUL) - while TIL is "German"! The document is also clearly in German.

# 4 (from LA2) -> can be deleted; the document is in German and has no second language!
# Q (from LA2) -> can be deleted; the document is in German and has no second language! 


## Function: Get work language from LA

Example

```turtle
@prefix lang: <http://id.loc.gov/vocabulary/iso639-2/> .
<W> bf:language lang:ger .
```

Calls the generic language code lookup function above, get_langtag_from_field, passing the LA field content, returning a uri from the library of congress language vocabulary (built from namespace + 3-letter iso code). 

In [17]:
def get_work_language(record):
    """Return the language uri of the work, built from the record's LA field.

    The LA content is translated by get_langtag_from_field into a three-letter
    code, which is appended to the LANG namespace.

    A missing or empty LA field no longer raises AttributeError; it is treated
    like any unrecognized language and yields the "und" (undetermined) uri.
    """
    # findtext returns None for a missing element and "" for an empty one
    la_content = (record.findtext("LA") or "").strip()
    language_code = get_langtag_from_field(la_content)[1]
    return LANG[language_code]

## Function: Create Instance Title nodes from fields TI, TIU, TIL, TIUE...

Titles and Translated titles are attached to Instances. Translated titles also have a source, which can be DeepL, ZPID, or Original.

Example:

```turtle
<Instance> bf:title 
        [a bf:Title; 
            bf:mainTitle "Disentangling the process of epistemic change"@en;
            bf:subtitle "The role of epistemic volition"@en;
        ],
        [a pxc:TranslatedTitle;
            rdfs:label "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:mainTitle "Den Prozess des epistemischen Wandels entwirren: Die Rolle des epistemischen Willens."@de;
            bf:adminMetadata  [ 
                a bf:AdminMetadata ;
                bflc:metadataLicensor  "DeepL";
        ]
        ].
```

- [x] add TI as bf:Title via bf:mainTitle
- [x] add subtitle from TIU
- [x] create a concatenated rdfs:label from TI and TIU
- [x] add languages for maintitle and subtitle (from TIL and TIUL)

- [x] add translated title from TIUE as pxc:TranslatedTitle with bf:mainTitle and rdfs:label 
- [x] add languages for translated title (from subfield TIUE |s, or if unavailable, decide language based on TIL language: if de -> en and vice versa) 
- [x] find a way to create a source for the translated title (from "(DeepL)" at the end)

In [18]:
# Called once per record from the main loop. Builds the title blank node in
# records_bf; the caller links the returned node to the instance via bf:title.
def get_bf_title(record):
    """Create a bf:Title blank node from TI, TIU, TIL and TIUL and return it.

    The node gets:
    - bf:mainTitle from TI, tagged with the language from TIL,
    - bf:subtitle from TIU (only if TIU contains text), tagged with the
      language from TIUL,
    - rdfs:label with "main title: subtitle" (or just the main title), tagged
      with the main title's language.
    Both languages default to "en" when their field is missing or empty.

    TI is required: a record without a TI element raises AttributeError.
    """
    title = BNode()
    records_bf.add((title, RDF.type, BF.Title))

    maintitle = record.find("TI").text.strip()

    # languages of main title and subtitle; keep the default "en" unless the
    # language field actually contains something:
    maintitle_language = "en"
    subtitle_language = "en"
    til_content = (record.findtext("TIL") or "").strip()
    if til_content:
        maintitle_language = get_langtag_from_field(til_content)[0]
    tiul_content = (record.findtext("TIUL") or "").strip()
    if tiul_content:
        subtitle_language = get_langtag_from_field(tiul_content)[0]

    records_bf.add((title, BF.mainTitle, Literal(maintitle, lang=maintitle_language)))

    # The full title for the rdfs:label starts out as the main title and is
    # extended below if there is a subtitle.
    fulltitle = maintitle

    # Only add a subtitle if TIU exists _and_ has text in it. (The text has to
    # be checked, not the element: an element never equals "".)
    subtitle = (record.findtext("TIU") or "").strip()
    if subtitle:
        fulltitle = f"{maintitle}: {subtitle}"
        records_bf.add((title, BF.subtitle, Literal(subtitle, lang=subtitle_language)))

    # We don't care if the languages of main title and subtitle differ - the
    # full title simply gets the main title's language.
    records_bf.add((title, RDFS.label, Literal(fulltitle, lang=maintitle_language)))

    return title

# Builds the translated title node; the caller links it to the instance.
def get_bf_translated_title(record):
    """Create a pxc:TranslatedTitle blank node from field TIUE and return it.

    Language of the translated title:
    - taken from the "|s" subfield at the end of TIUE, if there is one;
    - otherwise the inverse of TIL: "en" if TIL is German, else "de";
    - "de" if there is neither.

    Source of the translation: "DeepL" if the title ends with "(DeepL)" (the
    marker is removed from the title), otherwise "ZPID". It is stored in a
    bf:AdminMetadata node via bflc:metadataLicensor.
    Note: we might be able to add the source "Original" by finding out whether
    the source of the secondary abstract is something other than ZPID.
    """
    title_node = BNode()
    records_bf.add((title_node, RDF.type, PXC.TranslatedTitle))

    title_text = record.find("TIUE").text.strip()
    language_tag = "de"  # default

    # split off a trailing language subfield, e.g. "Some title |s English":
    subfield = re.search(r'^(.*)\s\|s\s(.*)', title_text)
    if subfield is not None:
        title_text = subfield.group(1).strip()
        language_tag = get_langtag_from_field(subfield.group(2).strip())[0]
    elif record.find("TIL") is not None:
        # no subfield: guess from the language of the original title
        original_language = get_langtag_from_field(record.find("TIL").text.strip())[0]
        if original_language == "de":
            language_tag = "en"

    # cut a trailing "(DeepL)" out of the title and remember it as the source:
    translation_source = "ZPID"
    deepl_marker = re.search(r'^(.*)\((DeepL)\)$', title_text)
    if deepl_marker is not None:
        title_text = deepl_marker.group(1).strip()
        translation_source = deepl_marker.group(2)

    # node that records who made the translation:
    source_node = BNode()
    records_bf.add((source_node, RDF.type, BF.AdminMetadata))
    records_bf.add((source_node, BFLC.metadataLicensor, Literal(translation_source)))

    # the same string serves as main title and as label:
    title_literal = Literal(title_text, lang=language_tag)
    records_bf.add((title_node, BF.mainTitle, title_literal))
    records_bf.add((title_node, RDFS.label, title_literal))
    records_bf.add((title_node, BF.adminMetadata, source_node))

    return title_node


## TODO: Function: Add Abstracts - original abstract (from fields ABH, ABLH, ASH1, ASH2) and translated/secondary abstract (from ABN, ABLN, ASN1, ASN2)

- Main Abstract: 
    - abstract text is in field ABH.
    - abstract language is in ABLH ("German" or "English")
    - abstract original source is in ASH1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASH2 ()
- Secondary Abstract 
    - abstract text is in field ABN.
    - abstract language is in ABLN ("German" or "English")
    - abstract original source is in ASN1 ("Original" or "ZPID")
    - agent who edited the original, if that happened, is in ASN2 ()

Use this scheme:

```turtle
<W> bf:summary 
    [ a pxc:Abstract , bf:Summary ;
        rdfs:label  "Background: Loneliness is ..."@en ;
        bf:adminMetadata  [ 
            a bf:AdminMetadata ;
            bflc:metadataLicensor  "Original";
            bf:descriptionModifier "ZPID"
        ]
] .
```

In [19]:
# Builds the node for the original abstract; the caller links it to the work.
def get_bf_abstract(record):
    """Create a pxc:Abstract blank node for the main abstract and return it.

    Fields used:
    - ABH:  abstract text (required), stored as rdfs:label
    - ABLH: language of the text; "en" if the field is missing
    - ASH1: original source of the abstract; "Original" if the field is
            missing. Stored as bflc:metadataLicensor.
    - ASH2: optional agent who edited the abstract. Stored as
            bf:descriptionModifier.
    Source and editor live in a nested bf:AdminMetadata blank node.
    """
    abstract_node = BNode()
    records_bf.add((abstract_node, RDF.type, PXC.Abstract))

    # the text and its language:
    text = record.find("ABH").text.strip()
    ablh = record.find("ABLH")
    language_tag = "en" if ablh is None else get_langtag_from_field(ablh.text.strip())[0]
    records_bf.add((abstract_node, RDFS.label, Literal(text, lang=language_tag)))

    # admin metadata node holding source and editor:
    admin_node = BNode()
    records_bf.add((admin_node, RDF.type, BF.AdminMetadata))

    ash1 = record.find("ASH1")
    licensor = "Original" if ash1 is None else ash1.text.strip()
    records_bf.add((admin_node, BFLC.metadataLicensor, Literal(licensor)))

    ash2 = record.find("ASH2")
    if ash2 is not None:
        editor = ash2.text.strip()
        # TODO: normalize the editor before adding it:
        # - replace known zpid employees and their initials with "ZPID":
        #   "Juergen Wiesenhuetter",
        #   "Joachim H. Becker","Udo Wolff", "Juergen Beling",
        #   "Joachim H. Mueller", "Angelika Zimmer", "Annelie Wiertz", "Beate Minsel", "Berndt Zuschlag",  "Doris Lecheler", "Elke Bone", "Guenter Krampen", "Hella Lenders", "Jutta Rohlmann", "Juergen Howe", "Manfred Opitz", "Manfred Fischer", "Paul Klein", "Sigrun-Heide Filipp", "Thomas W. Franke", "Ulrike Fischer", "Yrla M. Labouvie",
        #   "K.Si", "L.F.T.", "M.G.", "I.D." , "A.Bi.", "A.G.", "A.C.", "U.R.W", "U", "C.Si", "pe.k", "r", "R.N", "Ve.K.",
        # - replace "Author" or "Autor" with "Original"
        # - decide what to do with "DeepL", "FIS Bildung",
        #   "GESIS Fachinformation für die Sozialwissenschaften, Bonn",
        #   "Kriminologische Zentralstelle"
        records_bf.add((admin_node, BF.descriptionModifier, Literal(editor)))

    # finally, hang the admin metadata into the abstract:
    records_bf.add((abstract_node, BF.adminMetadata, admin_node))
    return abstract_node

def get_bf_secondary_abstract(record):
    """Create a blank node for the secondary/translated abstract and return it.

    The node is typed both pxc:Abstract and pxc:SecondaryAbstract.

    Fields used:
    - ABN:  abstract text (required), stored as rdfs:label
    - ABLN: language of the text; "de" if the field is missing
    - ASN1: original source of the abstract; "Original" if the field is
            missing. Stored as bflc:metadataLicensor.
    - ASN2: optional agent who edited the abstract. Stored as
            bf:descriptionModifier.
    Source and editor live in a nested bf:AdminMetadata blank node.
    """
    abstract_node = BNode()
    for abstract_class in (PXC.Abstract, PXC.SecondaryAbstract):
        records_bf.add((abstract_node, RDF.type, abstract_class))

    # the text and its language:
    text = record.find("ABN").text.strip()
    abln = record.find("ABLN")
    language_tag = "de" if abln is None else get_langtag_from_field(abln.text.strip())[0]
    records_bf.add((abstract_node, RDFS.label, Literal(text, lang=language_tag)))

    # admin metadata node holding source and editor:
    admin_node = BNode()
    records_bf.add((admin_node, RDF.type, BF.AdminMetadata))

    asn1 = record.find("ASN1")
    licensor = "Original" if asn1 is None else asn1.text.strip()
    records_bf.add((admin_node, BFLC.metadataLicensor, Literal(licensor)))

    # optional editor of the abstract:
    asn2 = record.find("ASN2")
    if asn2 is not None:
        records_bf.add((admin_node, BF.descriptionModifier, Literal(asn2.text.strip())))

    # finally, hang the admin metadata into the abstract:
    records_bf.add((abstract_node, BF.adminMetadata, admin_node))
    return abstract_node




## TODO: Function to split Table of Content from the Abstract field (ABH)

This usually starts with " - Inhalt: " (for German Abstracts) or " - Contents: " (in English abstracts) and ends at the end of the field.
It can contain a numbered list of chapters or sections as a long string. It can also contain a uri from dnb namespace instead or in addition!

Examples:
- " - Contents: (1) ..."
- " - Inhalt: https://d-nb.info/1256712809/04</ABH>" (URI pattern: "https://d-nb.info/" + "1256712809" 10 digits + "/04")

Example:

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdfs:label "(1) Wünsche, J., Weidmann, R. &amp; Grob, A. (n. d.). Happy in the same way? The link between domain satisfaction and overall life satisfaction in romantic couples. Manuscript submitted for publication. (2) Wünsche, J., Weidmann,...";
] .
```

Or

```turtle
<W> bf:tableOfContents [
    a bf:TableOfContents;
    rdf:value "https://d-nb.info/1002790794/04"^^xsd:anyURI ;
] .
```

In [20]:
def get_bf_toc(work_uri, record):
    """Attach a table of contents found at the end of the abstract (ABH).

    Looks for " - Inhalt: " or " - Contents: " in the ABH text. Everything
    after the last such marker is added to ``work_uri`` in records_bf via
    bf:tableOfContents, as a plain literal.

    Nothing is added when ABH is missing or empty, when there is no marker,
    or when nothing follows the marker.

    Returns whatever records_bf.add returns if a triple was added, else None.
    """
    # findtext returns None for a missing element and "" for an empty one
    abstract_text = (record.findtext("ABH") or "").strip()

    # re.DOTALL lets "." match newlines, so the marker is also found in
    # abstracts that span several lines.
    toc_match = re.search(
        r'^(.*)[-–]\s*(?:Contents|Inhalt)\s*:\s*(.*)$',
        abstract_text,
        flags=re.DOTALL,
    )
    contents = toc_match.group(2).strip() if toc_match else ""

    # TODO: check whether the contents are a string or a uri following the
    # d-nb pattern, and export the string as rdfs:label and the uri as
    # rdf:value "..."^^xsd:anyURI (remember to add the XSD namespace!).
    # TODO: wrap the contents in a bf:TableOfContents blank node.
    # TODO: the abstract itself should lose the contents part
    # (the text before the marker is toc_match.group(1)).

    if contents:
        return records_bf.add((work_uri, BF.tableOfContents, Literal(contents)))
    return None

## TODO: Function: Create Contribution nodes from Fields AUP, EMID, EMAIL, AUK, PAUP, CS and COU

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:contribution 
    [
        # the Bibframe Contribution includes, as usual, an agent and their role,
        # but is supplemented with an Affiliation (in the context of that work/while it was written),
        # and a position in the author sequence.
        a bf:Contribution, bflc:PrimaryContribution; 
        bf:agent 
        [
            a bf:Person, schema:Person; 
            rdfs:label "Trillitzsch, Tina"; # name when creating work
            schema:givenName "Tina"; schema:familyName "Trillitzsch";
            owl:sameAs <https://w3id.org/zpid/person/tt_0000001>, <https://orcid.org/0000-0001-7239-4844>; # authority uris of person (local, orcid)
            bf:identifiedBy [a bf:Local, pxc:PsychAuthorsID; rdf:value "p01979TTR"; #legacy authority ID
            ];
            bf:identifiedBy [a bf:Identifier, locid:orcid; rdf:value "0000-0001-7239-4844"; # ORCID 
            ];
        ]
        # we use a model inspired by Option C in Osma Suominen's suggestion for https://github.com/dcmi/dc-srap/issues/3
        # adding the Affiliation into the Contribution, separate from the agent itself, since the affiliation
        # is described in the context of this work, not as a statement about the person's
        # current affiliation:
        mads:hasAffiliation [
            a mads:Affiliation;
            # Affiliation blank node has info about the affiliation org (including persistent identifiers),
            # the address (country with geonames identifier),
            # and the person's email while affiliated there.
            mads:organization [
                a bf:Organization; 
                rdfs:label "Leibniz Institute of Psychology (ZPID); Digital Research Development Services"; # org name when work was created
                owl:sameAs <https://w3id.org/zpid/org/zpid_0000001>, <https://ror.org/0165gz615>; # authority uris of org (local, ror)
                # internal id and ror id as literal identifiers:
                bf:identifiedBy [a bf:Local, pxc:ZpidCorporateBodyId; rdf:value "0000001"; ];
                bf:identifiedBy [a bf:Identifier; locid:ror; rdf:value "0165gz615"; ];
            ];
            mads:hasAffiliationAddress [a mads:Address;
                mads:country [
                    a mads:Country, bf:Place;
                    rdfs:label "Germany";
                    bf:identifiedBy [a bf:Identifier, locid:geonames; rdf:value "2921044"; ];
                    owl:sameAs <https://w3id.org/zpid/place/country/ger>;
                ]
            ];
            mads:email <mailto:ttr@leibniz-psychology.org>; # correspondence author email
        ];
        bf:role <http://id.loc.gov/vocabulary/relators/aut>;
        pxp:contributionPosition 1; bf:qualifier "first"; # first author in sequence: our own subproperty of bf:qualifier & schema:position (also: middle, last)
    ].
```

## TODO: Function: Create Topics, Weighted Topics and Classifications from CT, SH

Use this scheme:

```turtle
<Work> a bf:Work;
    bf:subject [a bf:Topic, pxc:WeightedTopic, skos:Concept; # # topic, weighted
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/35365>;
        rdfs:label "Ontologies"@en, "Ontologien"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    bf:subject [a bf:Topic, skos:Concept; # a non-weighted topic
        owl:sameAs <https://w3id.org/zpid/vocabs/terms/60135>;
        rdfs:label "Semantic Networks"@en, "Semantische Netzwerke"@de;
        bf:source <https://w3id.org/zpid/vocabs/terms>;
    ];
    # PSYNDEX subject heading classification
    bf:classification [ a bf:Classification, pxc:SubjectHeading, skos:Concept;
        rdfs:label "Professional Psychological & Health Personnel Issues"@en;
        bf:code "3400";
        owl:sameAs <https://w3id.org/zpid/vocabs/class/3400>;
        bf:source <https://w3id.org/zpid/vocabs/class>;
    ].
```

## TODO: Function: Create nodes for Population Age Group (AGE) and Population Location (PLOC)

Use this scheme:

```turtle
<Work> 
# age group study is about/sample was from:
    bflc:demographicGroup [a bflc:DemographicGroup, pxc:AgeGroup, skos:Concept;
        rdfs:label "Adulthood"@en, "Erwachsenenalter"@de;
        owl:sameAs <https://w3id.org/zpid/vocabs/age/adulthood>;
        bf:source <https://w3id.org/zpid/vocabs/age/AgeGroups>; 
    ];
    # population location: 
    bf:geographicCoverage [a bf:GeographicCoverage, pxc:PopulationLocation, skos:Concept;
        rdfs:label "Germany"@en;
        owl:sameAs <countries/ger>;
    ].
```

# The Loop!
## Creating the Work and Instance uris and adding other triples via functions

### Uris and types for Bibframe profile

We want two URIs, since we split the Records into (at first) one work and one instance, which will be linked together.
We also say one will be a (rdf:type) bf:Work and the other bf:Instance.
Then we print all these triples into a file for the bibframe profile.

In [21]:
# print(len(root.findall("Record")))


# Go through all records and build the bibframe graph: one work and one
# instance per record, plus the nodes created by the functions above.
for record in root.findall("Record"):

    # get the DFK identifier from the record:
    dfk = record.find("DFK").text

    # create a URI for the work and the instance and give them their correct bf classes:
    work_uri = WORKS[dfk]
    records_bf.add((work_uri, RDF.type, BF.Work))
    instance_uri = INSTANCES[dfk]
    records_bf.add((instance_uri, RDF.type, BF.Instance))

    # connect work and instance via bf:instanceOf and bf:hasInstance:
    records_bf.add((instance_uri, BF.instanceOf, work_uri))
    records_bf.add((work_uri, BF.hasInstance, instance_uri))

    # add an identifier bnode to the work using a function:
    records_bf.add((work_uri, BF.identifiedBy, get_bf_identifier_dfk(dfk)))

    # get field TI and add as title node:
    records_bf.add((instance_uri, BF.title, get_bf_title(record)))

    # get work language from LA
    records_bf.add((work_uri, BF.language, get_work_language(record)))

    # The optional fields below are only processed if they exist AND contain
    # text. findtext returns None for a missing element and "" for an empty
    # one; checking the stripped text keeps the functions from being called
    # on fields without content.

    # get TIUE field and add as translated title node:
    if (record.findtext("TIUE") or "").strip():
        records_bf.add((instance_uri, BF.title, get_bf_translated_title(record)))

    # note: somehow not all records have a main abstract!
    if (record.findtext("ABH") or "").strip():
        # get toc, if it exists (it is part of the ABH field):
        get_bf_toc(work_uri, record)
        # get and add main/original abstract:
        records_bf.add((work_uri, BF.summary, get_bf_abstract(record)))

    # get and add secondary/translated abstract:
    # note: not all records have one either!
    if (record.findtext("ABN") or "").strip():
        records_bf.add((work_uri, BF.summary, get_bf_secondary_abstract(record)))


# write all the resulting triples to files and report how many there are:
records_bf.serialize("ttl-data/bibframe_records.ttl", format="turtle")
records_bf.serialize("ttl-data/bibframe_records.jsonld", format="json-ld")
print(len(records_bf), "triples")

9173 triples


### Uris and types for simplified profile (schema-org)

For the simplified profile, we only need one entity per record (for now) and we give it the class schema:CreativeWork.
Then we print the resulting triples into a separate file for the simplified profile that mostly uses schema.org properties and classes.

In [22]:


# print(len(root.findall("Record")))

# Build the simplified (schema.org) profile in its own graph, records_schema:
# each record contributes one entity with a class and a language.
for record in root.findall("Record"):
    # get the DFK identifier from the record:
    dfk = record.find("DFK").text

    # create a URI for the work by attaching the dfk to the works namespace
    # (the same URI the bibframe profile uses for the work) and
    # then give it the correct schema.org class:
    work_uri = WORKS[dfk]
    records_schema.add((work_uri, RDF.type, SCHEMA.CreativeWork))

    # get work language from LA; the value is the language uri returned by
    # get_work_language, not a plain language string:
    records_schema.add((work_uri, SCHEMA.inLanguage, get_work_language(record)))


# write the graph as JSON-LD (the turtle export is currently switched off)
# and report the number of triples:
records_schema.serialize("ttl-data/schema_records.jsonld", format="json-ld")
# records_schema.serialize("ttl-data/schema_records.ttl", format="turtle")
print(len(records_schema), "triples")

684 triples
