# Build Notebook
Build Notebook for [Project 1](https://github.com/jermnelson/linked-data-fragments/projects/1)

In [1]:
# Setup env
import datetime, rdflib, pdb, sys, hashlib, requests
import os, sys, rdflib, uuid
from bplustree import BPlusTree, StrSerializer
NOTEBOOK_DIR = os.path.split(os.path.abspath(os.curdir))[0]
PROJECT_DIR = os.path.split(NOTEBOOK_DIR)[0]
BASE_DIR = os.path.split(PROJECT_DIR)[0]
CACHE_DIR = os.path.join(PROJECT_DIR, "cache")
sys.path.append(CACHE_DIR)
import btree
print(datetime.datetime.now(), sys.platform, os.path.os.name, BASE_DIR, PROJECT_DIR)

2018-05-28 08:19:31.122533 linux posix /home/jpnelson/2018 /home/jpnelson/2018/linked-data-fragments


## Example One: Colorado College Knowledge Graphs

To run this example, **BASE_DIR** should be the root directory where you have cloned the following git repositories:

1.  `git clone https://github.com/jermnelson/linked-data-fragments.git`
1.  `git clone https://github.com/Tutt-Library/tiger-catalog.git`
1.  `git clone https://github.com/Tutt-Library/cc-scholarship-graph.git`


In [None]:

# Creating Colorado College Knowledge Graph
CC = rdflib.Graph()
def load_cc_scholarship():
	for path in ["/tiger-catalog/KnowledgeGraph/colorado-college.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-people.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-2016-2017.ttl",
		 "/tiger-catalog/KnowledgeGraph/cc-2017-2018.ttl",
		 "/cc-scholarship-graph/data/cc-research-statements.ttl",
		 "/cc-scholarship-graph/data/cc-fast-subjects.ttl",
		 "/cc-scholarship-graph/data/creative-works.ttl"]:
		full_path = "{}{}".format(BASE_DIR, path)        
		CC.parse(full_path, format='turtle')
load_cc_scholarship()
print("Colorado College Number of Triples: {:,}".format(len(CC)))

### Demo of BTree Linked Data Fragments 
Take subject, predicate, object triples from RDF Graph and create sha1 hashes of all Blank Nodes and IRI subjects and store in BTree. 

In [None]:
CC_TREE = BPlusTree("{}/tmp/cc-tree.db".format(PROJECT_DIR),
                   serializer=StrSerializer(),
                   order=25,
                   key_size=124)

In [None]:
def build_strategy_one():
    start = datetime.datetime.utcnow()
    print("Starting loading Colorado College's {:,} triples at {} ".format(
        len(CC),
        start.isoformat()))
    counter = 0
    for subj, pred, obj in CC:
        subj_sha1 = btree.add_entity(CC_TREE, subj)
        pred_sha1 = btree.add_entity(CC_TREE, pred)
        obj_sha1 = btree.add_entity(CC_TREE, obj)
        btree.add_patterns(CC_TREE, subj_sha1, pred_sha1, obj_sha1)
        if not counter%100 and counter > 0:
            print(".", end="")
        if not counter%1000:
            print("{:,}".format(counter), end="")
            CC_TREE.checkpoint()
        counter += 1

    end = datetime.datetime.utcnow()
    CC_TREE.close()
    print("Finished loading at {}, total time {:,} min for {:,} triples".format(
        end,
        (end-start).seconds / 60.0,
        len(CC)))

In [None]:
build_strategy_one()

In [None]:
entity_tree_path = os.path.join(PROJECT_DIR, "tmp/cc-entity-tree.db")
ENTITY_TREE = BPlusTree(
    entity_tree_path,
    order=10,
    serializer=StrSerializer(),
    key_size=40)
triples_tree_path = os.path.join(PROJECT_DIR, "tmp/cc-triples-tree.db")


In [None]:
TRIPLES_TREE = BPlusTree(
    triples_tree_path,
    order=15,
    serializer=StrSerializer(),
    key_size=124
)
def build_test_two(graph):
    start = datetime.datetime.utcnow()
    authority = 'http://catalog.coloradocollege.edu/'
    count = 0
    print("Starting Build Test Two at {} for {:,} Triples in Graph".format(
        start,
        len(graph)))
    for subj, pred, obj in graph:
        if isinstance(subj, rdflib.BNode):
            subj = subj.skolemize(authority=authority)
        if isinstance(obj, rdflib.BNode):
            obj = obj.skolemize(authority=authority) 
        subject_sha1 = btree.add_entity(ENTITY_TREE, subj)
        predicate_sha1 = btree.add_entity(ENTITY_TREE, pred)
        object_sha1 = btree.add_entity(ENTITY_TREE, obj)
        btree.add_patterns(TRIPLES_TREE, 
                           subject_sha1,
                           predicate_sha1,
                           object_sha1)
        if not count%100 and count > 0:
            print(".", end="")
        if not count%1000 and count > 0:
            ENTITY_TREE.checkpoint()
            TRIPLES_TREE.checkpoint()
            print("{:,}".format(count), end="")
        count += 1
    end = datetime.datetime.utcnow()
    ENTITY_TREE.checkpoint()
    TRIPLES_TREE.checkpoint()    
    entity_db_size = os.path.getsize(entity_tree_path)
    triples_db_size = os.path.getsize(triples_tree_path)
    print("""Finished at {}, total time {:,} minutes for {:,} triples.
        {:,} Entities in Btree, {:,} bytes for {}
        {:,} Triples in Btree, {:,} bytes for {}
    """.format(
        end,
        (end-start).seconds / 60.0,
        len(graph),
        len(ENTITY_TREE),
        entity_db_size,
        entity_tree_path,
        len(TRIPLES_TREE),
        triples_db_size,
        triples_tree_path))
    

In [None]:
build_test_two(CC)

In [None]:
TRIPLES_TREE = BPlusTree(
    triples_tree_path,
    order=15,
    serializer=StrSerializer(),
    key_size=32
)
def build_test_three(graph):
    start = datetime.datetime.utcnow()
    authority = 'http://catalog.coloradocollege.edu/'
    count = 0
    print("Starting Build Test Three at {} for {:,} Triples in Graph".format(
        start,
        len(graph)))
    for subj, pred, obj in graph:
        if isinstance(subj, rdflib.BNode):
            subj = subj.skolemize(authority=authority)
        if isinstance(obj, rdflib.BNode):
            obj = obj.skolemize(authority=authority)
        subject_sha1 = btree.add_entity(ENTITY_TREE, subj, 10)
        predicate_sha1 = btree.add_entity(ENTITY_TREE, pred, 10)
        object_sha1 = btree.add_entity(ENTITY_TREE, obj, 10)
        btree.add_patterns(TRIPLES_TREE, 
                           subject_sha1,
                           predicate_sha1,
                           object_sha1,
                           10)
        if not count%100 and count > 0:
            print(".", end="")
        if not count%1000 and count > 0:
            ENTITY_TREE.checkpoint()
            TRIPLES_TREE.checkpoint()
            print("{:,}".format(count), end="")
        count += 1
    end = datetime.datetime.utcnow()
    ENTITY_TREE.checkpoint()
    TRIPLES_TREE.checkpoint()    
    entity_db_size = os.path.getsize(entity_tree_path)
    triples_db_size = os.path.getsize(triples_tree_path)
    print("""Finished at {}, total time {:,} minutes for {:,} triples.
        {:,} Entities in Btree, {:,} bytes for {}
        {:,} Triples in Btree, {:,} bytes for {}
    """.format(
        end,
        (end-start).seconds / 60.0,
        len(graph),
        len(ENTITY_TREE),
        entity_db_size,
        entity_tree_path,
        len(TRIPLES_TREE),
        triples_db_size,
        triples_tree_path))

In [None]:
build_test_three(CC)

## Example Two: Local BTree Strategies

In [2]:
from bplustree import BPlusTree, StrSerializer
TEST_TREE = BPlusTree("/home/jpnelson/2018/linked-data-fragments/tmp/test-tree.db",
                   serializer=StrSerializer(),
                   order=10,
                   key_size=124)

In [3]:
prefix = """@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix relators: <http://id.loc.gov/vocabulary/relators/> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."""

sample_one = prefix + """

<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-18T01:49:24.602657" ;
            rdf:value "Generated by BIBCAT version 1.13.0 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <http://www.uwyo.edu/ahc/> ;
    bf:itemOf <https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .
    
<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> a bf:Instance ;
    rdfs:label "Granite Bluff, 1900" ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg> ] ;
    bf:instanceOf <https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> ;
    bf:subject [ a bf:Topic ;
            rdf:value "Knight, Samuel H., (Samuel Howell), 1892-1975" ],
        [ a bf:Topic ;
            rdf:value "Geology" ],
        [ a bf:Topic ;
            rdf:value "Wyoming--History--1890-" ] ;
    bf:title [ a bf:Title ;
            rdf:value "Granite Bluff, 1900" ] .

<https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> a bf:Work ;
    bf:partOf <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~148~148>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~22~22>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~96~96> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Photograph of Granite Bluff, below and east of tunnel, October 1900" ] ;
    bf:temporalCoverage "1900" .

    """

sample_two = prefix + """<http://cdm16079.contentdm.oclc.org:80/cdm/ref/collection/p15330coll22/id/32926> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <https://www.denverlibrary.org/> ;
    bf:itemOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> a bf:Instance ;
    bf:carrier [ a bf:Carrier ;
            rdf:value "Photographic prints." ],
        [ a bf:Carrier ;
            rdf:value "Imaged." ],
        [ a bf:Carrier ;
            rdf:value "Image" ],
        [ a bf:Carrier ;
            rdf:value "Nitrate negatives." ] ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926> ] ;
    bf:extent [ a bf:Extent ;
            rdf:value "1 photonegative : nitrate ; 15 x 10 cm. (6 x 4 in.); 1 photoprint ; 15 x 10 cm. (6 x 3 1/2 in.)"^^xsd:string ] ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:identifiedBy [ a bf:oclc ;
            rdf:value "40811329"^^xsd:string ],
        [ a bf:Local ;
            rdf:value "X-33358"^^xsd:string ] ;
    bf:instanceOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> ;
    bf:media [ a bf:Media ;
            rdf:value "Photograph"^^xsd:string ] ;
    bf:note [ a bf:Note ;
            bf:noteType "admin"@en ] ;
    bf:partOf <https://plains2peaks.org/wh231-joseph-g-masters-papers> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Outdoor seated portrait of the wife of Wooden Leg, a seventy-eight year old Native American Northern Cheyenne woman, wearing a long cotton dress and braids. She was eighteen and in camp at time of the Little Bighorn."^^xsd:string ] ;
    bf:title [ a bf:Title ;
            bf:mainTitle "Mrs. Wooden Leg, of Manderson, 18 yrs. old & in camp at time of Custer Battle"^^xsd:string ] .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> a bf:StillImage,
        bf:Work ;
    bf:changeDate "12/23/2010" ;
    bf:contribution [ a bf:Contribution ;
            bf:role relators:cre ;
            rdf:value "Masters, Joseph G., 1873-" ] ;
    bf:subject [ a bf:Temporal ;
            rdf:value "1936" ],
        <https://plains2peaks.org/agent/wooden-leg-mrs>,
        <https://plains2peaks.org/topic/aged-persons-1930-1940>,
        <https://plains2peaks.org/topic/cheyenne-indians-1930-1940>,
        <https://plains2peaks.org/topic/indians-of-north-america-1930-1940>,
        <https://plains2peaks.org/topic/little-bighorn-battle-of-the-mont-1876-veterans>,
        <https://plains2peaks.org/topic/women-1930-1940>,
        <https://plains2peaks.org/topic/wooden-leg-1858-family> . """




In [4]:
example1, example2 = rdflib.Graph(), rdflib.Graph()
example1.parse(data=sample_one, format='turtle')
example2.parse(data=sample_two, format='turtle')

<Graph identifier=N484d3a4a6cba41d69c3a1408499bb98d (<class 'rdflib.graph.Graph'>)>

In [None]:
example1.authority='http://catalog.coloradocollege.edu/'
new_example1 = example1.skolemize()

In [None]:
print(new_example1.serialize(format='turtle').decode())

In [5]:
# Tool chain for converting MODS to BIBFRAME Metadata
item_url="https://digitalcc.coloradocollege.edu/islandora/object/coccc:29797/"
mods_url = "{}datastream/MODS".format(item_url)
mods_result = requests.get(mods_url)
example3 = rdflib.Graph()
from bibcat.rml import processor
BASE_URL = 'https://tiger.coloradocollege.edu/'
if sys.platform.startswith("win"):
    cc_rule = 'D:/2018/dpla-service-hub/profiles/colorado-college.ttl'
elif sys.platform.startswith("linux"):
    cc_rule = "/home/jpnelson/2018/dpla-service-hub/profiles/colorado-college.ttl"
mods2bf = processor.XMLProcessor(
    rml_rules=['mods-to-bf.ttl', 
               'bibcat-base.ttl',
               cc_rule],
    namespaces={"mods": "http://www.loc.gov/mods/v3",
                    "xlink": "https://www.w3.org/1999/xlink"},
    base_url=BASE_URL,
    triplestore_url='http://localhost:9999/blazegraph/sparql')
instance_iri = rdflib.URIRef("{}{}".format(BASE_URL, uuid.uuid1()))
work_iri = rdflib.URIRef("{}#Work".format(instance_iri))
mods2bf.run(xml=mods_result.text,
            item_iri=rdflib.URIRef(item_url),
            instance_iri=instance_iri,
            work_iri=work_iri)
example3 = mods2bf.output

In [None]:
print(example3.serialize(format='turtle').decode())

![Example One Thumbnail](http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg)

![Example_Two Thumbnail](http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926)

In [6]:
# Save Examples to BTree
EXAMPLES_TREE = BPlusTree("{}/data/example-tree.db".format(PROJECT_DIR),
                          serializer=StrSerializer(),
                          order=25,
                          key_size=124)

def load_examples(tree):
    start = datetime.datetime.utcnow()
    print("Starting at {}".format(start))
    total = 0
    for row in [example1, example2, example3]:
        for s,p,o in row:
            total += 1
            is_ingested = btree.add_patterns(
                tree,
                btree.add_entity(tree, s),
                btree.add_entity(tree, p),
                btree.add_entity(tree, o)
            )
    end = datetime.datetime.utcnow()
    print("Finished loading {:,} triples at {}, total time={:,}".format(
        total,
        end,
        (end-start).seconds / 60.0))


In [7]:
print("Start size {:,}".format(len(EXAMPLES_TREE)))
load_examples(EXAMPLES_TREE)
print("End size {:,}".format(len(EXAMPLES_TREE)))
EXAMPLES_TREE.close()

Start size 1,700
Starting at 2018-05-28 14:20:27.918495
Finished loading 170 triples at 2018-05-28 14:20:28.768940, total time=0.0
End size 2,043


## BTree Scaling
First, we'll create the [Plains2Peaks.org](https://plains2peaks.org/) pilot dataset for testing purposes.

The Pilot RDF dataset is available on the Pilot's Github 

In [None]:
P2P_TREE = BPlusTree("{}/tmp/p2p-tree.db".format(PROJECT_DIR),
                   serializer=StrSerializer(),
                   order=25,
                   key_size=124)


In [None]:
P2P_PATH = os.path.join(BASE_DIR, "Plains2PeaksPilot/data")

def load_p2p(tree):
    start = datetime.datetime.utcnow()
    print("Starting Load Plains2Peaks into Linked Fragments Server at {}".format(
        start))
    for row in os.walk(P2P_PATH):
        walk_result = row
        break
    triple_count = 0
    for filename in walk_result[-1]:
        g = rdflib.Graph()
        g.parse(os.path.join(P2P_PATH, filename), format='turtle')
        print("""\tFinished parsing {}, total triples: {:,}. Starting 
    ingestion into BTree""".format(filename,
            len(g)))
        for sub, pred, obj in g:
            is_ingested = btree.add_patterns(tree,
                btree.add_entity(tree,sub),
                btree.add_entity(tree, pred),
                btree.add_entity(tree, obj))
            if not is_ingested:
                print("F{:,}".format(triple_count), end="")
            if not triple_count%100 and triple_count > 0:
                print(".", end="")
            if not triple_count%1000:
                tree.checkpoint()
                print("{:,}".format(triple_count), end="")
            triple_count += 1
    end = datetime.datetime.utcnow()
    tree_size = os.path.getsize("{}/tmp/p2p-tree.db".format(PROJECT_DIR))
    print("""Finished at {}, total time {:,} minutes for {:,} triples.
Final btree file size is {:,}k""".format(
    end,
    (end-start).seconds / 60.0,
    triple_count,
    tree_size))
            
                                             
load_p2p(P2P_TREE)    
    
                               
    
    
    

In [None]:
for row in os.walk("D:/2018/Plains2PeaksPilot/data"):
    print(row)