<a href="https://colab.research.google.com/github/hsandaver/hsandaver/blob/main/marctobibframeconvertor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook relies on: pymarc (MARC
# parsing), rdflib (RDF graph building) and pyshacl (SHACL validation).
# The leading "!" is notebook shell syntax, so this line only runs in a notebook.
!pip install pymarc rdflib pyshacl

from google.colab import files
print("Please upload your MARC file (binary .mrc format):")
# The result is iterated by key further down and each key is opened as a
# local file name; presumably Colab stores the uploads in the working
# directory under those names. TODO confirm against the Colab documentation.
uploaded = files.upload()  # Blocks until the user has picked the MARC file(s)

from pymarc import MARCReader
from rdflib import Graph, Namespace, URIRef, Literal, RDF

# Vocabulary prefixes used while building the graph
BF = Namespace("http://id.loc.gov/ontologies/bibframe/")
RDA = Namespace("http://rdaregistry.info/Elements/")
EX = Namespace("http://example.org/record/")

# A single graph receives the triples for every record that is converted;
# the prefixes are registered on it so serialisations can use short names.
graph = Graph()
for _prefix, _namespace in (("bf", BF), ("ex", EX), ("rda", RDA)):
    graph.bind(_prefix, _namespace)

print("Processing the MARC records...")

# Convert every uploaded MARC file into BIBFRAME triples on the shared graph.
#
# Fixes relative to the earlier version of this cell:
#   * Field.get_subfields() returns nothing when called without codes, so the
#     "all subfields" notes (040, 340, 347, 500, 506, 561, 583, 994) were
#     silently dropped; subfields are now read by iterating the field.
#   * Minted URIs no longer use the built-in hash(), which is salted per
#     process for strings and identity-based for objects, so the output
#     differed from run to run.
#   * Fields are looked up with record.get_fields(), which returns an empty
#     list for a missing tag instead of depending on what record[tag] does.
#   * Unreadable records (yielded as None by MARCReader) are skipped.
#   * Values placed inside IRIs are escaped so Turtle serialisation cannot
#     fail on a stray space or bracket.
import hashlib
from urllib.parse import quote

# Characters rdflib treats as invalid inside an IRI when writing N3/Turtle.
_INVALID_IRI_CHARS = '<>" {}|\\^`'

# Control fields copied verbatim into admin-metadata literals: (tag, label).
_CONTROL_FIELD_LABELS = (
    ("003", "Control Number Identifier"),
    ("005", "Date and Time of Latest Transaction"),
    ("008", "Fixed-Length Data Elements"),
)

# Repeatable fields whose $a maps straight to one property:
# (tag, which resource carries the triple, property).
_SUBFIELD_A_PROPERTIES = (
    ("334", "work", BF.modeOfIssuance),
    ("336", "work", BF.content),
    ("337", "instance", BF.media),
    ("338", "instance", BF.carrier),
    ("380", "work", BF.genreForm),
    ("388", "work", BF.temporalCoverage),
)

# Repeatable fields whose subfields are joined into one literal on the
# instance: (tag, property, text prefix).
_JOINED_NOTE_PROPERTIES = (
    ("340", BF.note, "Physical Medium: "),
    ("347", BF.note, "Digital File Characteristics: "),
    ("500", BF.note, ""),
    ("506", BF.usageAndAccessPolicy, ""),
    ("561", BF.provenance, ""),
    ("583", BF.note, "Action: "),
    ("994", BF.note, "Local Processing: "),
)


def _stable_id(*parts):
    """Return a 16-character hex id that is the same on every run for the same parts."""
    joined = "\x1f".join(str(part) for part in parts)
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:16]


def _uri_segment(text):
    """Percent-encode text so it can be used as one path segment of a minted URI."""
    return quote(text.strip(), safe="")


def _clean_iri(text):
    """Percent-encode only the characters that would make an external IRI unserialisable."""
    return "".join(
        quote(char, safe="") if char in _INVALID_IRI_CHARS else char
        for char in text.strip()
    )


def _first(field, code):
    """Return the stripped value of the first subfield with this code, or "" if absent."""
    values = field.get_subfields(code)
    return (values[0] or "").strip() if values else ""


def _subfield_values(field):
    """Return the stripped, non-empty values of every subfield of field, in order."""
    # Iterating a pymarc Field yields (code, value) pairs.
    return [value.strip() for _code, value in field if value and value.strip()]


def _linked_uri(field, codes=("1", "0")):
    """Return the first http(s) URI found in the given subfield codes, else None.

    Codes are tried in the order given, so with the default a $1 value is
    preferred over a $0 value.
    """
    for code in codes:
        for value in field.get_subfields(code):
            candidate = (value or "").strip()
            if candidate.startswith("http"):
                return URIRef(_clean_iri(candidate))
    return None


def _add_contributor(work_uri, field):
    """Link work_uri to the agent named in $a of a 100/700 field."""
    name = _first(field, "a")
    if not name:
        return
    agent_uri = _linked_uri(field)
    if agent_uri is None:
        # No authority URI supplied: mint one from the name so the same
        # heading always resolves to the same local resource.
        agent_uri = EX[f"person/{_stable_id(name)}"]
    graph.add((agent_uri, BF.label, Literal(name)))
    graph.add((work_uri, BF.contributor, agent_uri))


def _convert_record(record, record_id):
    """Add the BIBFRAME triples for one MARC record to the module-level graph.

    record is a parsed pymarc record; record_id is the text used to mint the
    Work, Instance and provision-activity URIs for it.
    """
    record_key = _uri_segment(record_id)
    work_uri = EX[f"work/{record_key}"]
    instance_uri = EX[f"instance/{record_key}"]
    subjects = {"work": work_uri, "instance": instance_uri}

    # Link Instance to Work
    graph.add((instance_uri, BF.instanceOf, work_uri))

    # Admin metadata from the control fields (first occurrence of each)
    for tag, label in _CONTROL_FIELD_LABELS:
        for field in record.get_fields(tag)[:1]:
            text = (field.value() or "").strip()
            graph.add((instance_uri, BF.adminMetadata, Literal(f"{label}: {text}")))

    # Leader is stored as admin data too
    graph.add((instance_uri, BF.adminMetadata, Literal("Leader: " + str(record.leader).strip())))

    # 007 fields as admin metadata notes
    for field in record.get_fields("007"):
        graph.add((instance_uri, BF.adminMetadata, Literal("Physical Characteristics (007): " + (field.value() or ""))))

    # 040 Cataloging source
    for field in record.get_fields("040")[:1]:
        source = "; ".join(_subfield_values(field))
        if source:
            graph.add((instance_uri, BF.adminMetadata, Literal("Cataloging Source: " + source)))

    # Title (245): join only the parts that are present to avoid double spaces
    for field in record.get_fields("245")[:1]:
        title_parts = [_first(field, code) for code in ("a", "b", "c")]
        full_title = " ".join(part for part in title_parts if part)
        if full_title:
            graph.add((work_uri, BF.title, Literal(full_title)))

    # Author (100)
    for field in record.get_fields("100")[:1]:
        _add_contributor(work_uri, field)

    # Edition (250)
    for field in record.get_fields("250")[:1]:
        edition = _first(field, "a")
        if edition:
            graph.add((instance_uri, BF.editionStatement, Literal(edition)))

    # Publication info (264): one provision node per field, numbered by position
    for number, field in enumerate(record.get_fields("264"), start=1):
        provision_uri = EX[f"provision/{record_key}/{number}"]
        for code, prop in (("a", BF.place), ("b", BF.agent), ("c", BF.date)):
            value = _first(field, code)
            if value:
                graph.add((provision_uri, prop, Literal(value)))
        graph.add((instance_uri, BF.provisionActivity, provision_uri))

    # Physical description (300)
    for field in record.get_fields("300")[:1]:
        extent_parts = [_first(field, code) for code in ("a", "b", "c")]
        extent = "; ".join(part for part in extent_parts if part)
        if extent:
            graph.add((instance_uri, BF.extent, Literal(extent)))

    # 334, 336, 337, 338, 380, 388: $a copied to a single property
    for tag, target, prop in _SUBFIELD_A_PROPERTIES:
        for field in record.get_fields(tag):
            value = _first(field, "a")
            if value:
                graph.add((subjects[target], prop, Literal(value)))

    # 340, 347, 500, 506, 561, 583, 994: all subfields joined into one literal
    for tag, prop, prefix in _JOINED_NOTE_PROPERTIES:
        for field in record.get_fields(tag):
            text = " ".join(_subfield_values(field))
            if text:
                graph.add((instance_uri, prop, Literal(prefix + text)))

    # 655 - Genre/Form: a resource when $1 carries a URI, otherwise a literal
    for field in record.get_fields("655"):
        genre = _first(field, "a")
        if not genre:
            continue
        genre_uri = _linked_uri(field, codes=("1",))
        if genre_uri is not None:
            graph.add((work_uri, BF.genreForm, genre_uri))
            graph.add((genre_uri, BF.label, Literal(genre)))
        else:
            graph.add((work_uri, BF.genreForm, Literal(genre)))

    # 700 - Additional contributors
    for field in record.get_fields("700"):
        _add_contributor(work_uri, field)

    # 773 - Host Item Entry
    for field in record.get_fields("773"):
        host_title = _first(field, "t")
        control_number = _first(field, "w")
        if control_number:
            host_key = control_number.replace("(", "").replace(")", "").replace("OCoLC", "oclc")
        elif host_title:
            host_key = _stable_id(host_title)
        else:
            host_key = _stable_id(record_id)
        host_uri = EX["work/host/" + _uri_segment(host_key)]

        if host_title:
            graph.add((host_uri, BF.title, Literal(host_title)))
        graph.add((work_uri, BF.partOf, host_uri))

    # 856 - Electronic access
    for field in record.get_fields("856"):
        url = _first(field, "u")
        if not url:
            continue
        electronic_uri = URIRef(_clean_iri(url))
        graph.add((instance_uri, BF.electronicLocator, electronic_uri))
        for code in ("3", "y"):
            label = _first(field, code)
            if label:
                graph.add((electronic_uri, BF.label, Literal(label)))


for filename in uploaded:
    with open(filename, 'rb') as fh:
        reader = MARCReader(fh)
        for position, record in enumerate(reader, start=1):
            if record is None:
                # pymarc yields None for a record it could not parse and
                # keeps the reason on the reader.
                reason = getattr(reader, "current_exception", None)
                print(f"Skipping unreadable record {position} in {filename}: {reason}")
                continue

            control_numbers = record.get_fields('001')
            record_id = (control_numbers[0].value() or "").strip() if control_numbers else ""
            if not record_id:
                # No usable 001: derive a repeatable id from where the record sits.
                record_id = _stable_id(filename, position)

            _convert_record(record, record_id)

print("MARC to BIBFRAME conversion completed.")

# Write the graph out as Turtle, then offer the file to the user as a download
output_file = "output_bibframe.ttl"
print(f"Serializing the RDF graph to {output_file}...")

serialized_ttl = graph.serialize(format="turtle")
with open(output_file, mode="w", encoding="utf-8") as ttl_handle:
    ttl_handle.write(serialized_ttl)

print(f"Conversion complete. Downloading {output_file}...")
files.download(output_file)

print("Downloading BIBFRAME shapes for SHACL validation...")
!wget https://raw.githubusercontent.com/lcnetdev/bibframe-shapes/master/shapes/BIBFRAME-shapes.ttl -O bibframe-shapes.ttl

from pyshacl import validate

shapes_graph = Graph()
shapes_graph.parse("bibframe-shapes.ttl", format="turtle")

if len(graph) == 0:
    print("Warning: The RDF graph is empty. Validation may fail or produce no output.")

print("Validating RDF against BIBFRAME SHACL shapes. This may take a moment...")
conforms, results_graph, results_text = validate(
    data_graph=graph,
    shacl_graph=shapes_graph,
    inference="rdfs",
    debug=True
)

if conforms:
    print("RDF conforms to BIBFRAME specifications. No errors found.")
else:
    print("Validation errors detected:")
    print(results_text)