# Build Notebook
Build Notebook for [Project 1](https://github.com/jermnelson/linked-data-fragments/projects/1)

## Example One: Colorado College Knowledge Graphs

To run this example, **BASE_DIR** should be the root directory where you have cloned the following git repositories:

1.  `git clone https://github.com/jermnelson/linked-data-fragments.git`
1.  `git clone https://github.com/Tutt-Library/tiger-catalog.git`
1.  `git clone https://github.com/Tutt-Library/cc-scholarship-graph.git`


In [31]:
import datetime, rdflib, pdb, sys, hashlib, requests
import os, sys, rdflib, uuid

PROJECT_DIR = os.path.split(os.path.abspath(os.curdir))[0]
BASE_DIR = os.path.split(BASE_DIR)[0]
CACHE_DIR = os.path.join(BASE_DIR, "cache")
sys.path.append(PROJECT_DIR)
print(sys.platform, os.path.os.name, BASE_DIR, PROJECT_DIR)
from bplustree import BPlusTree, StrSerializer
import btree
# Creating Colorado College Knowledge Graph
CC = rdflib.Graph()
def load_cc_scholarship():
	for path in ["/tiger-catalog/KnowledgeGraph/colorado-college.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-people.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-2016-2017.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-2017-2018.ttl",
		 "/cc-scholarship-graph/data/cc-research-statements.ttl",
		 "/cc-scholarship-graph/data/cc-fast-subjects.ttl",
		 "/cc-scholarship-graph/data/creative-works.ttl"]:
		full_path = "{}{}".format(BASE_DIR, path)        
		CC.parse(full_path, format='turtle')
load_cc_scholarship()
print("Colorado College Number of Triples: {:,}".format(len(CC)))

win32 nt D:\2018 D:\2018\linked-data-fragments
Colorado College Number of Triples: 28,122


### Demo of BTree Linked Data Fragments 
Take subject, predicate, object triples from RDF Graph and create sha1 hashes of all Blank Nodes and IRI subjects and store in BTree. 

In [32]:
CC_TREE = BPlusTree("{}tmp/cc-tree.db".format(PROJECT_DIR),
                   serializer=StrSerializer(),
                   order=10,
                   key_size=124)

In [33]:
start = datetime.datetime.utcnow()
print("Starting loading Colorado College's {:,} triples at {} ".format(
    len(CC),
    start.isoformat()))
counter = 0
for subj, pred, obj in CC:
    subj_sha1 = hashlib.sha1(str(subj).encode()).hexdigest()
    pred_sha1 = hashlib.sha1(str(pred).encode()).hexdigest()
    obj_sha1 = hashlib.sha1(str(obj).encode()).hexdigest()
    # Add/Create Subject, Predicate, and Object
    if not subj_sha1 in CC_TREE:
        CC_TREE.insert(subj_sha1, str(subj).encode())
    if not pred_sha1 in CC_TREE:
        CC_TREE.insert(pred_sha1, str(pred).encode())
    if not obj_sha1 in CC_TREE:
        CC_TREE.insert(obj_sha1, str(obj).encode())
    btree.add_patterns(CC_TREE, subj_sha1, pred_sha1, obj_sha1)
    if not counter%100 and counter > 0:
        print(".", end="")
        CC_TREE.checkpoint()
    if not counter%1000:
        print("{:,}".format(counter), end="")
    counter += 1
        
end = datetime.datetime.utcnow()
CC_TREE.close()
print("Finished loading at {}, total time {:,} min for {:,} triples".format(
    end,
    (end-start).seconds / 60.0,
    len(CC)))

Starting loading Colorado College's 28,122 triples at 2018-05-22T16:38:33.903752 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000.............................................................................

In [36]:
first_result = """Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000....................................................................................................9,000....................................................................................................10,000....................................................................................................11,000....................................................................................................12,000....................................................................................................13,000....................................................................................................14,000....................................................................................................15,000....................................................................................................16,000....................................................................................................17,000....................................................................................................18,000....................................................................................................19,000....................................................................................................20,000....................................................................................................21,000....................................................................................................22,000....................................................................................................23,000....................................................................................................24,000....................................................................................................25,000....................................................................................................26,000....................................................................................................27,000...................................................................................................Finished loading at 2018-05-21 22:18:58.331714, total time 10.916666666666666 min for 27,992 triples
"""
print(first_result)
second_result = """Starting loading Colorado College's 28,122 triples at 2018-05-22T16:38:33.903752 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000....................................................................................................9,000....................................................................................................10,000....................................................................................................11,000....................................................................................................12,000....................................................................................................13,000....................................................................................................14,000....................................................................................................15,000....................................................................................................16,000....................................................................................................17,000....................................................................................................18,000....................................................................................................19,000....................................................................................................20,000....................................................................................................21,000....................................................................................................22,000....................................................................................................23,000....................................................................................................24,000....................................................................................................25,000....................................................................................................26,000....................................................................................................27,000....................................................................................................28,000............Finished loading at 2018-05-22 16:49:19.265349, total time 10.75 min for 28,122 triples
"""
print(second_result)

Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000.............................................................................

## Example Two: Local BTree Strategies

In [45]:
from bplustree import BPlusTree, StrSerializer
TEST_TREE = BPlusTree("/home/jpnelson/2018/linked-data-fragments/tmp/test-tree.db",
                   serializer=StrSerializer(),
                   order=10,
                   key_size=124)

ValueError: No directory /home/jpnelson/2018/linked-data-fragments/tmp

In [41]:
prefix = """@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix relators: <http://id.loc.gov/vocabulary/relators/> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."""

sample_one = prefix + """

<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-18T01:49:24.602657" ;
            rdf:value "Generated by BIBCAT version 1.13.0 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <http://www.uwyo.edu/ahc/> ;
    bf:itemOf <https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .
    
<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> a bf:Instance ;
    rdfs:label "Granite Bluff, 1900" ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg> ] ;
    bf:instanceOf <https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> ;
    bf:subject [ a bf:Topic ;
            rdf:value "Knight, Samuel H., (Samuel Howell), 1892-1975" ],
        [ a bf:Topic ;
            rdf:value "Geology" ],
        [ a bf:Topic ;
            rdf:value "Wyoming--History--1890-" ] ;
    bf:title [ a bf:Title ;
            rdf:value "Granite Bluff, 1900" ] .

<https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> a bf:Work ;
    bf:partOf <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~148~148>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~22~22>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~96~96> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Photograph of Granite Bluff, below and east of tunnel, October 1900" ] ;
    bf:temporalCoverage "1900" .

    """

sample_two = prefix + """<http://cdm16079.contentdm.oclc.org:80/cdm/ref/collection/p15330coll22/id/32926> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <https://www.denverlibrary.org/> ;
    bf:itemOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> a bf:Instance ;
    bf:carrier [ a bf:Carrier ;
            rdf:value "Photographic prints." ],
        [ a bf:Carrier ;
            rdf:value "Imaged." ],
        [ a bf:Carrier ;
            rdf:value "Image" ],
        [ a bf:Carrier ;
            rdf:value "Nitrate negatives." ] ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926> ] ;
    bf:extent [ a bf:Extent ;
            rdf:value "1 photonegative : nitrate ; 15 x 10 cm. (6 x 4 in.); 1 photoprint ; 15 x 10 cm. (6 x 3 1/2 in.)"^^xsd:string ] ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:identifiedBy [ a bf:oclc ;
            rdf:value "40811329"^^xsd:string ],
        [ a bf:Local ;
            rdf:value "X-33358"^^xsd:string ] ;
    bf:instanceOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> ;
    bf:media [ a bf:Media ;
            rdf:value "Photograph"^^xsd:string ] ;
    bf:note [ a bf:Note ;
            bf:noteType "admin"@en ] ;
    bf:partOf <https://plains2peaks.org/wh231-joseph-g-masters-papers> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Outdoor seated portrait of the wife of Wooden Leg, a seventy-eight year old Native American Northern Cheyenne woman, wearing a long cotton dress and braids. She was eighteen and in camp at time of the Little Bighorn."^^xsd:string ] ;
    bf:title [ a bf:Title ;
            bf:mainTitle "Mrs. Wooden Leg, of Manderson, 18 yrs. old & in camp at time of Custer Battle"^^xsd:string ] .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> a bf:StillImage,
        bf:Work ;
    bf:changeDate "12/23/2010" ;
    bf:contribution [ a bf:Contribution ;
            bf:role relators:cre ;
            rdf:value "Masters, Joseph G., 1873-" ] ;
    bf:subject [ a bf:Temporal ;
            rdf:value "1936" ],
        <https://plains2peaks.org/agent/wooden-leg-mrs>,
        <https://plains2peaks.org/topic/aged-persons-1930-1940>,
        <https://plains2peaks.org/topic/cheyenne-indians-1930-1940>,
        <https://plains2peaks.org/topic/indians-of-north-america-1930-1940>,
        <https://plains2peaks.org/topic/little-bighorn-battle-of-the-mont-1876-veterans>,
        <https://plains2peaks.org/topic/women-1930-1940>,
        <https://plains2peaks.org/topic/wooden-leg-1858-family> . """




In [42]:
example1, example2 = rdflib.Graph(), rdflib.Graph()
example1.parse(data=sample_one, format='turtle')
example2.parse(data=sample_two, format='turtle')

<Graph identifier=N4fa9f41ffd1640ce96c56f3cece57812 (<class 'rdflib.graph.Graph'>)>

In [38]:
# Tool chain for converting MODS to BIBFRAME Metadata
item_url="https://digitalcc.coloradocollege.edu/islandora/object/coccc:29797/"
mods_url = "{}datastream/MODS".format(item_url)
mods_result = requests.get(mods_url)
example3 = rdflib.Graph()
from bibcat.rml import processor
BASE_URL = 'https://tiger.coloradocollege.edu/'
if sys.platform.startswith("win"):
    cc_rule = 'D:/2018/dpla-service-hub/profiles/colorado-college.ttl'
elif sys.platform.startswith("linux"):
    cc_rule = "/home/jpnelson/2018/dpla-service-hub/profiles/colorado-college.ttl"
mods2bf = processor.XMLProcessor(
    rml_rules=['mods-to-bf.ttl', 
               'bibcat-base.ttl',
               cc_rule],
    namespaces={"mods": "http://www.loc.gov/mods/v3",
                    "xlink": "https://www.w3.org/1999/xlink"},
    base_url=BASE_URL,
    triplestore_url='http://localhost:9999/blazegraph/sparql')
instance_iri = rdflib.URIRef("{}{}".format(BASE_URL, uuid.uuid1()))
work_iri = rdflib.URIRef("{}#Work".format(instance_iri))
mods2bf.run(xml=mods_result.text,
            item_iri=rdflib.URIRef(item_url),
            instance_iri=instance_iri,
            work_iri=work_iri)
example3 = mods2bf.output

In [39]:
print(example3.serialize(format='turtle').decode())

@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix bc: <http://knowledgelinks.io/ns/bibcat/> .
@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix kds: <http://knowledgelinks.io/ns/data-structures/> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix mods: <http://www.loc.gov/mods/v3> .
@prefix oslo: <http://purl.org/oslo/ns/localgov#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix ql: <http://semweb.mmlab.be/ns/ql#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix relators: <http://id.loc.gov/vocabulary/relators/> .
@prefix rml: <http://semweb.mmlab.be/ns/rml#> .
@prefix rr: <http://www.w3.org/ns/r2rml#> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001

![Example One Thumbnail](http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg)

![Example_Two Thumbnail](http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926)

In [None]:
print(example1.serialize(format='nt').decode())

In [44]:
import hashlib, pickle
count = 0
for s,p,o in example1:
    print(s,p,o)
    if count >= 10:
        break
    count += 1
    

https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6 http://id.loc.gov/ontologies/bibframe/subject ub55bL23C16
http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236 http://id.loc.gov/ontologies/bibframe/usageAndAccessPolicy http://rightsstatements.org/vocab/CNE/1.0/
https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work http://id.loc.gov/ontologies/bibframe/partOf http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~22~22
https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6 http://id.loc.gov/ontologies/bibframe/title ub55bL29C14
https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://id.loc.gov/ontologies/bibframe/Instance
ub55bL29C14 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://id.loc.gov/ontologies/bibframe/Title
ub55bL11C26 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://id.loc.gov/ontologies/bibframe/GenerationProcess
ub55bL36C16 http://www.w

In [None]:
def add_key(tree, value):
    if not isinstance(value, bytes):
        value = value.encode()
    key = hashlib.sha1(value).hexdigest()
    if not key in tree:
        tree.insert(key, value)
    return key

def add_collections(tree, 
                 subject_sha1, 
                 predicate_sha1,
                 object_sha1):
    p_o_key = "{}:{}".format(predicate_sha1, object_sha1)
    s_p_key = "{}:{}".format(subject_sha1, predicate_sha1)
    for key in [p_o_key, s_p_key]:
        entities = []
        if key in tree:
            entities = pickle.loads(
                tree.get(key))
        
    
    
    
        
def quick_cache(tree, fields):
    subject_sha1 = add_key(tree, fields[0]) 
    predicate_sha1 = add_key(tree, fields[1])
    object_sha1 = add_key(tree, fields[2])
    subject_predicate = "{}:{}".format(subject_sha1,
                                      predicate_sha1)
    objects = []
    if subject_predicate in tree:
        objects = pickle.loads(tree.get(subject_predicate))
    objects.append(object_sha1)
    tree[subject_predicate] = pickle.dumps(list(set(objects)))
    wildcard_predicate = "*:{}".format(predicate_sha1)
    if wildcard_predicate in tree:
        wildcard_subjects
                               
    
    
    

In [None]:
import urllib.parse
import re, tempfile, uuid
triple_pattern = re.compile(r"(\w+)>(\w+)>(\w+)")

class TriplePatternSelector(object):
    
    def __init__(self, **kwargs):
        self.subject_selector = kwargs.get("subject", "?subject")
        self.predicate_selector = kwargs.get("predicate", "?predicate")
        self.object_selector = kwargs.get("object", "?object")
        self.db_tree = kwargs.get("btree")
        if not self.db_tree:
            self.db_path = kwargs.get("db_path")
            if not self.db_path:
                self.temp_file = tempfile.TemporaryFile()
                self.db_path = self.temp_file.name
            self.db_tree = BPlusTree(
                self.db_path,
                serializer=StrSerializer(),
                key_size=124
            )
        self.data = []
        # Sets URI for selector 
        self.base_url = kwargs.get('base_url', 'http://localhost:7000')
    
    @property
    def uri(self):
        return urllib.parse.urljoin(
            self.base_url,
            str(uuid.uuid1()))
    
    @property
    def metadata(self):
        return {
            "subject": self.uri,
            "predicate": "void:triples",
            "object": len(self.data)
        }
    
    def __iter__(self):
        return self
    
    def __next__(self):
        if len(self.data) >= 10:
            raise StopIteration
        result = []
        print(self.subject_selector, self.subject_selector.startswith("?subject"))
        if not self.subject_selector.startswith("?subject"):
            subject_key = hashlib.sha1(
                str(self.subject_selector).encode()).hexdigest()
            for key in self.db_tree[subject_key:]:
                if not key.startswith(subject_key):
                    raise StopIteration
                triple_result = triple_pattern.search(key)
                print(key, triple_result)
                if triple_result:
                    triples = triple_result.groups()
                    result.append({"s": self.subject_selector,
                                   "p": self.db_tree.get(triples[1]).decode(),
                                   "o": self.db_tree.get(triples[2]).decode()})
                              
                
                
        #if not self.predicate_selector.startswith("?predicate"):
        #    key, value = yield self.db_tree.items(slice(start=self.predicate_selector))
        #    result['p'] =  value
        #if not self.object_selector.startswith("?object"):
        #key, value = yield self.db_tree.items(slice(start=self.object_selector))
         #   result['o'] = value
        self.data.extend(result)
        return result
    
    def __del__(self):
        if hasattr(self, "temp_file"):
            self.temp_file.close()
        if hasattr(self, "db_path"):
            self.db_tree.close()
        
        
class BtreeTriplePatternFragment(object):

	def __init__(self, **kwargs):
		self.db_location = kwargs.get("database")
		self.db_tree = BPlusTree(
			self.db_location,
			serializer=StrSerializer(),
			key_size=124)
		self.size = kwargs.get("size", 1000)
		self.triple_pattern = re.compile(r"(\w+)>(\w+)>(\w+)")

	def __populate_triple__(self, subject_key, predicate_key, object_key):
		return {"s": self.db_tree.get(subject_key).decode(),
			"p": self.db_tree.get(predicate_key).decode(),
			"o": self.db_tree.get(object_key).decode()}
				
		
			

	def triples(self, **kwargs):
		results = []
		subject = kwargs.get("subject")
		predicate = kwargs.get("predicate")
		object_ = kwargs.get("object")
		if subject:
			subject_key = hashlib.sha1(
				str(subject).encode()).hexdigest()
			
		return results
		
	def __del__(self):
		self.db_tree.close()