Taking as input the data generated by the script `cm_tagme_resource_reference_data`, this script produces RDF triples for pages in the cm_tagme dataset.

In [1]:
import os, pickle, json, pprint
from rdflib import Graph, URIRef, Namespace, Literal, XSD
from rdflib.namespace import RDF, OWL

In [2]:
# Load the page records that the triples are generated from.
# NOTE: pickle.load can execute arbitrary code, so only read files from a trusted source.
with open('/data/cm/output/generators/cm_tagme_pages_data.pickle', 'rb') as pickle_file:
    cm_tagme_pages_data = pickle.load(pickle_file)

In [3]:
# Empty RDF graph that the cells below fill with triples and finally serialize.
cm_tagme_pages = Graph()

In [4]:
# Define the namespace of the JudaicaLink ontology; the predicates used below
# (jl.belongsToIssue, jl.belongsToJournal, jl.title, jl.hasVisualRepresentation)
# are resolved against this base URI.
jl = Namespace("http://data.judaicalink.org/ontology/")
# Bind the 'jl' prefix on the graph so the serialized output can abbreviate these URIs.
cm_tagme_pages.bind('jl', jl)

In [5]:
# Show the first record to document the shape of the input data.
print(cm_tagme_pages_data[0])

# Sanity check: print the first record (if there is one) whose journal name is empty.
first_untitled = next(
    (record for record in cm_tagme_pages_data if record['journal_name'] == ''),
    None,
)
if first_untitled is not None:
    print(first_untitled)

{'full_page': '2710055-2710056-2710057--019-2710121', 'page': '2710121', 'journal': '2710055', 'journal_name': 'Der neue Anfang', 'issue': '2710057'}


In [6]:
# Generate the page, issue and journal triples, plus the links from pages and
# journals to their visual representation on sammlungen.ub.uni-frankfurt.de.
resource_base = "http://data.judaicalink.org/data/compact-memory/"
viewer_base = "http://sammlungen.ub.uni-frankfurt.de/cm/periodical/pageview/"

datapoint_count = 0  # stays 0 when there are no records at all
for datapoint_count, datapoint in enumerate(cm_tagme_pages_data, start=1):
    # Build every RDF term for this record before touching the graph.
    page_uri = URIRef(resource_base + datapoint['full_page'])
    journal_uri = URIRef(resource_base + datapoint['journal'])
    title_literal = Literal(datapoint['journal_name'], datatype=XSD.string)
    # An empty 'issue' field means no issue resource is created for this record.
    issue_uri = URIRef(resource_base + datapoint['issue']) if datapoint['issue'] != '' else None
    page_view = URIRef(viewer_base + datapoint['page'])
    # NOTE(review): the journal view is also built on the "pageview/" path - confirm this is intended.
    journal_view = URIRef(viewer_base + datapoint['journal'])

    # Collect this record's triples, then add them to the graph in order.
    record_triples = []
    if issue_uri is not None:
        record_triples.append((page_uri, jl.belongsToIssue, issue_uri))
        record_triples.append((issue_uri, jl.belongsToJournal, journal_uri))
    # NOTE(review): a record without an issue gets no triple connecting its page
    # to its journal - confirm this is acceptable for consumers of the dataset.
    record_triples.append((journal_uri, jl.title, title_literal))
    record_triples.append((page_uri, jl.hasVisualRepresentation, page_view))
    record_triples.append((journal_uri, jl.hasVisualRepresentation, journal_view))

    for triple in record_triples:
        cm_tagme_pages.add(triple)

In [7]:
# Report how many input records were read and how many triples the graph holds.
# The graph keeps each triple only once, so repeated add() calls for the same
# triple do not increase this count.
triple_count = len(cm_tagme_pages)
print("Processed {} datapoints.".format(datapoint_count))
print("Generated {} triples in this dataset.".format(triple_count))

Processed 747760 datapoints.
Generated 624682 triples in this dataset.


In [8]:
# Write the finished graph to disk in Turtle ("ttl") format.
cm_tagme_pages.serialize(destination="/data/cm/output/generators/cm_pages.ttl", format="ttl")