# Package Alma BIBFRAME for export to OCLC (experimental)

This experimental project uses an [SRU query](https://na03.alma.exlibrisgroup.com/view/sru/01UPENN_INST?version=1.2&operation=searchRetrieve&recordSchema=lc_bf_instance&query=alma.title_uri=%22sinopia%22&maximumRecords=50) to gather all the BIBFRAME (BF) Instances from Sinopia that are found in Alma. In the SRU code cell below (`In [2]`), replace the region and the institution code with the values for your institution.

## Region
``region = 'na03.alma.exlibrisgroup.com'`` (host name only; the code adds `https://` and the path itself)

## Institution code 
###  replace (01UPENN_INST) with your code
``institution_code = '01UPENN_INST'``

## OCLC Documentation
[OCLC RDF structure](https://help.oclc.org/Metadata_Services/WorldShare_Collection_Manager/Data_sync_collections/Prepare_your_data/Structure_BIBFRAME_data)


In [1]:
%pip install rdflib
%pip install lxml
%pip install requests
%pip install saxoncee


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# use SRU to get all the BF Instances in Alma

import requests
from lxml import etree

# Alma SRU endpoint settings: replace both values with the ones for your institution.
region = 'na03.alma.exlibrisgroup.com'
institution_code = '01UPENN_INST'
# The query asks for at most 50 records whose title URI contains "sinopia".
url = (
    f'https://{region}/view/sru/{institution_code}'
    '?version=1.2&operation=searchRetrieve&recordSchema=lc_bf_instance'
    '&query=alma.title_uri=%22sinopia%22&maximumRecords=50'
)
# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of handing an error
# page to the XML parser below.
response = requests.get(url, timeout=30)
response.raise_for_status()
# print(response.text)

# parse the response for the Instance URIs and collect them in a dictionary
root = etree.fromstring(response.content)

# Namespace prefixes in the "{uri}local-name" form that lxml expects.
srw_ns = '{http://www.loc.gov/zing/srw/}'
rdf_ns = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
bf_ns = '{http://id.loc.gov/ontologies/bibframe/}'

# Maps each Instance rdf:about URI to the rdf:about URI of its bf:sameAs element.
instance_dict = {}

# Walk every SRU record; a record missing any expected piece is reported and skipped.
for record in root.iter(srw_ns + 'record'):
    record_data = record.find(srw_ns + 'recordData')
    if record_data is None:
        print("No recordData element found in record element.")
        continue

    rdf = record_data.find('.//' + rdf_ns + 'RDF')
    if rdf is None:
        print("No RDF element found in recordData element.")
        continue

    instance = rdf.find('.//' + bf_ns + 'Instance')
    if instance is None:
        print("No Instance element found in RDF element.")
        continue

    instance_uri = instance.get(rdf_ns + 'about')
    if not instance_uri:
        print("No 'about' attribute found in Instance element.")
        continue

    same_as = instance.find('.//' + bf_ns + 'sameAs')
    if same_as is None:
        print(f"No sameAs element found for instance {instance_uri}.")
        continue

    same_as_uri = same_as.get(rdf_ns + 'about')
    if not same_as_uri:
        print(f"No 'about' attribute found in sameAs element for instance {instance_uri}.")
        continue

    # Keys are unique, so a repeated Instance URI keeps the last sameAs seen.
    instance_dict[instance_uri] = same_as_uri

print("Collected instance URIs and their corresponding sameAs URIs:", instance_dict)
print(f"Total instances collected: {len(instance_dict)}")

Collected instance URIs and their corresponding sameAs URIs: {'https://api.stage.sinopia.io/resource/4ac80e0a-d28d-458d-bd7f-5f808023af8d': 'https://na03.alma.exlibrisgroup.com/bf/instances/9979461923903681?institute=01UPENN_INST', 'https://api.stage.sinopia.io/resource/6dff794b-965b-424f-abcb-cbe16ab9e260': 'https://na03.alma.exlibrisgroup.com/bf/instances/9979515924303681?institute=01UPENN_INST', 'https://api.stage.sinopia.io/resource/f355280e-5b61-47f2-9df3-56e5e825c459': 'https://na03.alma.exlibrisgroup.com/bf/instances/9979493224403681?institute=01UPENN_INST', 'https://api.stage.sinopia.io/resource/cbefd65b-29c6-4f08-b456-543281fdaddd': 'https://na03.alma.exlibrisgroup.com/bf/instances/9979461924203681?institute=01UPENN_INST', 'https://api.stage.sinopia.io/resource/aecc9f22-83be-444a-9f86-99514955e699': 'https://na03.alma.exlibrisgroup.com/bf/instances/9979497424103681?institute=01UPENN_INST', 'https://api.stage.sinopia.io/resource/01a226e9-ed48-4185-ad77-94c1723a3fec': 'https://n

# Collect the Instance data

In [3]:
import lxml.etree as ET
from marc_xml.lc_bfxml_work import lc_bfxml_work, remove_last_line
from marc_xml.lc_bfxml_instance import lc_bfxml_instance, remove_rdf_header

In [4]:
import os
from lxml import etree as ET
from lxml.etree import QName

def rdf2marcxml(instance_dict):
    """Build a combined, normalized Work + Instance RDF/XML file per Instance.

    For every ``instance_uri -> same_as_uri`` pair this function:

    1. calls the ``marc_xml`` helpers for ``instance_uri`` (presumably they
       fetch the data and leave ``bfxml_work.xml`` and
       ``lc_bfxml_instance.xml`` in the working directory -- confirm against
       the ``marc_xml`` package),
    2. concatenates those two files (Work first) into
       ``LoC_Work_Instance.xml``, adding the ``sinopiabf`` and ``owl``
       namespace declarations to the ``rdf:RDF`` start tag,
    3. appends an ``owl:SameAs`` element pointing at ``same_as_uri`` to the
       matching ``bf:Instance``,
    4. writes the result to ``oclc_export/<id>.xml``, and
    5. applies ``marc_xml/xsl/pre-transform-normalize.xsl`` and writes the
       output to ``oclc_export/<id>_normalized.rdf``.

    ``<id>`` is the last path segment of ``instance_uri``.

    Args:
        instance_dict: Mapping of Instance URI to the URI to record as its
            ``owl:SameAs`` target.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Ensure the export directory exists
    export_dir = 'oclc_export'
    os.makedirs(export_dir, exist_ok=True)

    if not instance_dict:
        return

    namespaces = {
        'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        'bf': "http://id.loc.gov/ontologies/bibframe/",
        'owl': "http://www.w3.org/2002/07/owl#"
    }

    # The stylesheet is the same for every record, so compile it once
    # instead of re-parsing it on each loop iteration.
    transform = ET.XSLT(ET.parse("marc_xml/xsl/pre-transform-normalize.xsl"))

    combined_file_path = "LoC_Work_Instance.xml"

    for instance_uri, same_as_uri in instance_dict.items():
        # Helpers from marc_xml, called for their side effects on the
        # working files read below.
        lc_bfxml_work(instance_uri)
        remove_last_line()
        lc_bfxml_instance(instance_uri)
        remove_rdf_header()

        # Combine the two files, work first. Bytes are used throughout so the
        # XML is never re-decoded with the platform's default text encoding.
        with open("bfxml_work.xml", "rb") as work_file:
            work = work_file.read()
        with open("lc_bfxml_instance.xml", "rb") as instance_file:
            instance = instance_file.read()

        # Add the sinopiabf and owl namespaces while combining, so the file
        # is written once rather than written, re-read and rewritten.
        combined_content = (work + instance).replace(
            b'<rdf:RDF',
            b'<rdf:RDF xmlns:sinopiabf="http://sinopia.io/vocabulary/bf/" xmlns:owl="http://www.w3.org/2002/07/owl#"',
        )
        with open(combined_file_path, "wb") as combined_file:
            combined_file.write(combined_content)

        # Parse the combined XML
        dom = ET.parse(combined_file_path)
        root = dom.getroot()

        # Insert the sameAs link as <owl:SameAs rdf:resource="uri_value"/>
        # NOTE(review): the OWL vocabulary spells this property owl:sameAs;
        # the capitalised name is kept because later stylesheets may match
        # on it -- confirm before changing.
        owl_same_as = ET.Element(
            "{http://www.w3.org/2002/07/owl#}SameAs",
            attrib={QName(namespaces['rdf'], "resource"): same_as_uri},
        )
        # Pass the URI as an XPath variable so quotes or other special
        # characters in it cannot break or alter the expression.
        matches = root.xpath(
            './/bf:Instance[@rdf:about=$uri]',
            namespaces=namespaces,
            uri=instance_uri,
        )
        if matches:
            matches[0].append(owl_same_as)
        else:
            print(f"Instance element not found for URI: {instance_uri}")

        # Extract the unique ID from the Sinopia URI
        unique_id = instance_uri.split('/')[-1]

        # Save the updated XML with the unique ID as the file name
        updated_file_path = os.path.join(export_dir, f"{unique_id}.xml")
        with open(updated_file_path, "wb") as f:
            f.write(ET.tostring(root, pretty_print=True, encoding="utf-8"))

        # Normalize; `dom` already contains the owl:SameAs element because
        # `root` was modified in place.
        newdom = transform(dom)

        normalized_file_path = os.path.join(export_dir, f"{unique_id}_normalized.rdf")
        with open(normalized_file_path, "wb") as f:
            f.write(ET.tostring(newdom, pretty_print=True, encoding="utf-8"))

        print(f"Processed and saved: {normalized_file_path}")

# Expected data model: each key is a Sinopia Instance URI and each value is the
# Alma URI collected for it from the SRU response, for example:
# instance_dict = {
#     "https://api.stage.sinopia.io/resource/4ac80e0a-d28d-458d-bd7f-5f808023af8d": "https://na03.alma.exlibrisgroup.com/bf/instances/9979461923903681?institute=01UPENN_INST",
#     "https://api.stage.sinopia.io/resource/6dff794b-965b-424f-abcb-cbe16ab9e260": "https://na03.alma.exlibrisgroup.com/bf/instances/9979515924303681?institute=01UPENN_INST"
# }
rdf2marcxml(instance_dict)

Processed and saved: oclc_export/4ac80e0a-d28d-458d-bd7f-5f808023af8d_normalized.rdf
Processed and saved: oclc_export/6dff794b-965b-424f-abcb-cbe16ab9e260_normalized.rdf
Processed and saved: oclc_export/f355280e-5b61-47f2-9df3-56e5e825c459_normalized.rdf
Processed and saved: oclc_export/cbefd65b-29c6-4f08-b456-543281fdaddd_normalized.rdf
Processed and saved: oclc_export/aecc9f22-83be-444a-9f86-99514955e699_normalized.rdf
Processed and saved: oclc_export/01a226e9-ed48-4185-ad77-94c1723a3fec_normalized.rdf
Processed and saved: oclc_export/13161c2c-f3ce-457f-98e4-b793527c3a57_normalized.rdf


  self.predicate(predicate, object, depth + 1)


Processed and saved: oclc_export/b75d97b0-5528-4933-8ffd-312f74aad719_normalized.rdf
Processed and saved: oclc_export/16ae6996-a268-49f3-a6af-b15f1a55e7b0_normalized.rdf
Processed and saved: oclc_export/3967aa17-44f6-4bc8-b50e-bfbe4d99ef4b_normalized.rdf
Processed and saved: oclc_export/502b7a2a-75bf-481b-85f3-07029c27f6f7_normalized.rdf
Processed and saved: oclc_export/22f7ede6-498f-4be3-94d6-af95977365a2_normalized.rdf
Processed and saved: oclc_export/a945a191-b04a-4549-a8b5-ea8d29ba0ae9_normalized.rdf
Processed and saved: oclc_export/72c517ed-56f6-4c68-8244-46ed9089da93_normalized.rdf


  self.predicate(predicate, object, depth + 1)


Processed and saved: oclc_export/45fe6ec4-81a8-4df3-a75c-1046bf97f3e2_normalized.rdf
Processed and saved: oclc_export/e96616a1-33db-4438-bc59-dad7c9fbab4e_normalized.rdf
Processed and saved: oclc_export/2dea0ef5-17b2-45f2-8327-1d3de839392f_normalized.rdf
Processed and saved: oclc_export/10352dc4-dd9c-402d-9957-5925f5868a97_normalized.rdf
Processed and saved: oclc_export/2b09bb49-549d-4851-9a52-5feb0abc0843_normalized.rdf
Processed and saved: oclc_export/23f4e924-dcbb-4346-a010-f85e95d96b73_normalized.rdf
Processed and saved: oclc_export/62102f4a-4ea1-4bbd-8ae2-bb9f26f142e8_normalized.rdf
Processed and saved: oclc_export/aba33721-6078-412e-96ae-8c27895646ea_normalized.rdf


# Apply the final OCLC transform (rdf2oclc.xsl) to the normalized data

In [5]:
import os
from lxml import etree

def apply_final_xslt(instance_dict):
    """Apply ``xsl/rdf2oclc.xsl`` to every normalized RDF file.

    For each Instance URI the last path segment is used as ``<id>``;
    ``oclc_export/<id>_normalized.rdf`` is read, transformed, written to
    ``oclc_export/<id>_final.rdf`` and also printed.

    Args:
        instance_dict: Mapping (or any iterable) whose keys are Instance
            URIs. The values are not used here.

    Returns:
        None. Results are written to disk and printed.

    Raises:
        FileNotFoundError: If a ``<id>_normalized.rdf`` file is missing.
    """
    # Ensure the export directory exists
    export_dir = 'oclc_export'
    os.makedirs(export_dir, exist_ok=True)

    # Load and compile the XSLT once; it is reused for every record.
    xslt_file = 'xsl/rdf2oclc.xsl'
    transform = etree.XSLT(etree.parse(xslt_file))

    for instance_uri in instance_dict:
        # Extract the unique ID from the Sinopia URI
        unique_id = instance_uri.split('/')[-1]

        # Read the input as bytes and let the XML parser handle the encoding.
        # Text mode would decode with the platform default encoding, and lxml
        # rejects str input that carries an XML encoding declaration.
        input_file = os.path.join(export_dir, f"{unique_id}_normalized.rdf")
        with open(input_file, 'rb') as f:
            input_doc = etree.fromstring(f.read())

        # Apply the transformation
        result_doc = transform(input_doc)

        # Serialize once and reuse the bytes for both the file and the printout.
        serialized = etree.tostring(result_doc, pretty_print=True, encoding='UTF-8')

        # Save the transformed XML to a file with the unique ID
        output_file = os.path.join(export_dir, f"{unique_id}_final.rdf")
        with open(output_file, 'wb') as f:
            f.write(serialized)

        # Print the transformed XML
        print(f"Transformed XML for {unique_id}:")
        print(serialized.decode('UTF-8'))

# Expected data model: only the keys (Sinopia Instance URIs) are used by
# apply_final_xslt; the values are ignored. Example:
# instance_dict = {
#     "https://api.stage.sinopia.io/resource/4ac80e0a-d28d-458d-bd7f-5f808023af8d": "https://na03.alma.exlibrisgroup.com/bf/instances/9979461923903681?institute=01UPENN_INST",
#     "https://api.stage.sinopia.io/resource/6dff794b-965b-424f-abcb-cbe16ab9e260": "https://na03.alma.exlibrisgroup.com/bf/instances/9979515924303681?institute=01UPENN_INST"
# }
apply_final_xslt(instance_dict)

Transformed XML for 4ac80e0a-d28d-458d-bd7f-5f808023af8d:
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:bf="http://id.loc.gov/ontologies/bibframe/" xmlns:bflc="http://id.loc.gov/ontologies/bflc/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:cc="http://creativecommons.org/ns#" xmlns:foaf="http://xmlns.com/foaf/0.1" xmlns:sinopia="http://sinopia.io/vocabulary/">
  <bf:Instance rdf:about="https://api.stage.sinopia.io/resource/4ac80e0a-d28d-458d-bd7f-5f808023af8d">
    <rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Print"/>
    <bf:identifiedBy>
      <bf:Lccn>
        <rdf:value>64056569</rdf:value>
      </bf:Lccn>
    </bf:identifiedBy>
    <bf:identifiedBy>
      <bf:Local>
        <rdf:value>3270700</rdf:value>
        <bf:assigner rdf:resource="http://id.loc.gov/vocabulary/organizations/ocolc"/>
    