# Build Notebook
Build Notebook for [Project 1](https://github.com/jermnelson/linked-data-fragments/projects/1)

## Example One: Colorado College Knowledge Graphs

In [8]:
import datetime, rdflib, pdb, sys, hashlib
from bplustree import BPlusTree, StrSerializer
sys.path.append("D:/2018/linked-data-fragments/cache/")
import btree
# Colorado College Knowledge Graph: one in-memory rdflib graph that every
# Turtle source file listed below is parsed into.
CC = rdflib.Graph()


def load_cc_scholarship():
    """Parse each Colorado College Turtle file into the module-level ``CC`` graph.

    The files are read in the order listed; nothing is returned because the
    triples are accumulated directly in ``CC``.
    """
    turtle_sources = (
        "D:/2018/tiger-catalog/KnowledgeGraph/colorado-college.ttl",
        "D:/2018/tiger-catalog/KnowledgeGraph/cc-people.ttl",
        "D:/2018/tiger-catalog/KnowledgeGraph/cc-2016-2017.ttl",
        "D:/2018/tiger-catalog/KnowledgeGraph/cc-2017-2018.ttl",
        "D:/2018/cc-scholarship-graph/data/cc-research-statements.ttl",
        "D:/2018/cc-scholarship-graph/data/cc-fast-subjects.ttl",
        "D:/2018/cc-scholarship-graph/data/creative-works.ttl",
    )
    for turtle_source in turtle_sources:
        CC.parse(turtle_source, format='turtle')


load_cc_scholarship()
print("Colorado College Number of Triples: {:,}".format(len(CC)))

Colorado College Number of Triples: 27,992


### Demo of BTree Linked Data Fragments 
Take the subject, predicate, and object of each triple in the RDF graph, create a SHA-1 hash of each term (IRIs, blank nodes, and literals alike), and store each hash with its term in the BTree.

In [9]:
# On-disk B+ tree used by the Colorado College demo. Keys go through
# StrSerializer, i.e. they are plain strings.
# NOTE(review): key_size=124 presumably leaves room for three 40-character
# SHA-1 hex digests joined by ":" -- confirm against btree.add_patterns.
CC_TREE = BPlusTree(
    "D:/2018/linked-data-fragments/tmp/cc-tree.db",
    key_size=124,
    order=10,
    serializer=StrSerializer())

In [10]:
# Load every triple of the CC graph into CC_TREE.
#
# For each triple the subject, predicate and object are stringified, hashed
# with SHA-1, and stored under their hex digest (once per distinct term).
# The three digests are then handed to btree.add_patterns.
# NOTE(review): add_patterns lives in the project-local btree module; what it
# writes is not visible from this notebook.
start = datetime.datetime.utcnow()
print("Starting loading Colorado College's {:,} triples at {} ".format(
    len(CC),
    start.isoformat()))
counter = 0
try:
    for subj, pred, obj in CC:
        term_sha1s = []
        for term in (subj, pred, obj):
            raw_term = str(term).encode()
            term_sha1 = hashlib.sha1(raw_term).hexdigest()
            # Membership is checked first so a term already stored by an
            # earlier triple is not inserted a second time.
            if term_sha1 not in CC_TREE:
                CC_TREE.insert(term_sha1, raw_term)
            term_sha1s.append(term_sha1)
        subj_sha1, pred_sha1, obj_sha1 = term_sha1s
        btree.add_patterns(CC_TREE, subj_sha1, pred_sha1, obj_sha1)
        # Progress: a dot (and a checkpoint) every 10 triples, a running
        # count every 1,000 triples.
        if counter and counter % 10 == 0:
            print(".", end="")
            CC_TREE.checkpoint()
        if counter % 1000 == 0:
            print("{:,}".format(counter), end="")
        counter += 1
    end = datetime.datetime.utcnow()
finally:
    # Always release the tree file, even if the load fails part-way.
    CC_TREE.close()
# total_seconds() is used instead of .seconds, which silently drops whole
# days and the sub-second part of the elapsed time.
print("Finished loading at {}, total time {:,} min for {:,} triples".format(
    end,
    (end - start).total_seconds() / 60.0,
    len(CC)))

Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000.............................................................................

In [4]:
output = """Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000....................................................................................................9,000....................................................................................................10,000....................................................................................................11,000....................................................................................................12,000....................................................................................................13,000....................................................................................................14,000....................................................................................................15,000....................................................................................................16,000....................................................................................................17,000....................................................................................................18,000....................................................................................................
19,000....................................................................................................20,000....................................................................................................21,000....................................................................................................22,000....................................................................................................23,000....................................................................................................24,000....................................................................................................25,000....................................................................................................26,000....................................................................................................27,000...................................................................................................Finished loading at 2018-05-21 22:18:58.331714, total time 10.916666666666666 min for 27,992 triples
"""
print(output)

Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000.............................................................................

In [5]:
# Elapsed load time in minutes. total_seconds() covers the whole timedelta;
# .seconds would ignore the days and microseconds components.
(end - start).total_seconds() / 60.0


3.75

## Example Two: Unit Test for Local BTree 

In [5]:
import os, sys, rdflib
# Show which platform and working directory this section is executed from.
for environment_detail in (sys.platform, os.path.abspath(os.curdir)):
    print(environment_detail)

linux
/home/jpnelson/2018/linked-data-fragments/doc


In [7]:
from bplustree import BPlusTree, StrSerializer
# On-disk B+ tree for the local unit-test example; same settings as the
# Colorado College tree (string keys, order 10, 124-byte keys).
TEST_TREE = BPlusTree(
    "/home/jpnelson/2018/linked-data-fragments/tmp/test-tree.db",
    key_size=124,
    order=10,
    serializer=StrSerializer())

In [8]:
# Shared Turtle @prefix header that is prepended to each sample document
# below, so the samples can use the bf:, rdf:, rdfs:, relators:, xsd:, ...
# prefixes. The literal has no trailing newline.
prefix = """@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix relators: <http://id.loc.gov/vocabulary/relators/> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."""

sample_one = prefix + """

<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-18T01:49:24.602657" ;
            rdf:value "Generated by BIBCAT version 1.13.0 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <http://www.uwyo.edu/ahc/> ;
    bf:itemOf <https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .
    
<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> a bf:Instance ;
    rdfs:label "Granite Bluff, 1900" ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg> ] ;
    bf:instanceOf <https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> ;
    bf:subject [ a bf:Topic ;
            rdf:value "Knight, Samuel H., (Samuel Howell), 1892-1975" ],
        [ a bf:Topic ;
            rdf:value "Geology" ],
        [ a bf:Topic ;
            rdf:value "Wyoming--History--1890-" ] ;
    bf:title [ a bf:Title ;
            rdf:value "Granite Bluff, 1900" ] .

<https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> a bf:Work ;
    bf:partOf <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~148~148>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~22~22>,
        <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~96~96> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Photograph of Granite Bluff, below and east of tunnel, October 1900" ] ;
    bf:temporalCoverage "1900" .

    """

sample_two = prefix + """<http://cdm16079.contentdm.oclc.org:80/cdm/ref/collection/p15330coll22/id/32926> a bf:Item ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:heldBy <https://www.denverlibrary.org/> ;
    bf:itemOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> ;
    bf:usageAndAccessPolicy <http://rightsstatements.org/vocab/CNE/1.0/> .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008> a bf:Instance ;
    bf:carrier [ a bf:Carrier ;
            rdf:value "Photographic prints." ],
        [ a bf:Carrier ;
            rdf:value "Imaged." ],
        [ a bf:Carrier ;
            rdf:value "Image" ],
        [ a bf:Carrier ;
            rdf:value "Nitrate negatives." ] ;
    bf:coverArt [ a bf:CoverArt ;
            rdf:value <http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926> ] ;
    bf:extent [ a bf:Extent ;
            rdf:value "1 photonegative : nitrate ; 15 x 10 cm. (6 x 4 in.); 1 photoprint ; 15 x 10 cm. (6 x 3 1/2 in.)"^^xsd:string ] ;
    bf:generationProcess [ a bf:GenerationProcess ;
            bf:generationDate "2017-08-03T04:14:32.360747" ;
            rdf:value "Generated by BIBCAT version 1.10.2 from KnowledgeLinks.io"^^xsd:string ] ;
    bf:identifiedBy [ a bf:oclc ;
            rdf:value "40811329"^^xsd:string ],
        [ a bf:Local ;
            rdf:value "X-33358"^^xsd:string ] ;
    bf:instanceOf <https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> ;
    bf:media [ a bf:Media ;
            rdf:value "Photograph"^^xsd:string ] ;
    bf:note [ a bf:Note ;
            bf:noteType "admin"@en ] ;
    bf:partOf <https://plains2peaks.org/wh231-joseph-g-masters-papers> ;
    bf:summary [ a bf:Summary ;
            rdf:value "Outdoor seated portrait of the wife of Wooden Leg, a seventy-eight year old Native American Northern Cheyenne woman, wearing a long cotton dress and braids. She was eighteen and in camp at time of the Little Bighorn."^^xsd:string ] ;
    bf:title [ a bf:Title ;
            bf:mainTitle "Mrs. Wooden Leg, of Manderson, 18 yrs. old & in camp at time of Custer Battle"^^xsd:string ] .

<https://plains2peaks.org/40633c2c-7802-11e7-b0f9-005056c00008#Work> a bf:StillImage,
        bf:Work ;
    bf:changeDate "12/23/2010" ;
    bf:contribution [ a bf:Contribution ;
            bf:role relators:cre ;
            rdf:value "Masters, Joseph G., 1873-" ] ;
    bf:subject [ a bf:Temporal ;
            rdf:value "1936" ],
        <https://plains2peaks.org/agent/wooden-leg-mrs>,
        <https://plains2peaks.org/topic/aged-persons-1930-1940>,
        <https://plains2peaks.org/topic/cheyenne-indians-1930-1940>,
        <https://plains2peaks.org/topic/indians-of-north-america-1930-1940>,
        <https://plains2peaks.org/topic/little-bighorn-battle-of-the-mont-1876-veterans>,
        <https://plains2peaks.org/topic/women-1930-1940>,
        <https://plains2peaks.org/topic/wooden-leg-1858-family> . """

In [9]:
# One graph per sample Turtle document.
example1 = rdflib.Graph()
example2 = rdflib.Graph()
example1.parse(data=sample_one, format='turtle')
# Left as the final expression so the cell displays the parsed graph.
example2.parse(data=sample_two, format='turtle')

<Graph identifier=Nae82ab66ed5a425d842f9969b3dfad6b (<class 'rdflib.graph.Graph'>)>

![Example One Thumbnail](http://digitalcollections.uwyo.edu/MediaManager/srvr?mediafile=/Size2/uwydbuwy-22-NA/1883/ah400044_00354.jpg)

![Example Two Thumbnail](http://digital.denverlibrary.org/utils/getthumbnail/collection/p15330coll22/id/32926)

In [10]:
# Print the first sample graph as N-Triples.
# serialize() returns bytes on rdflib < 6 but str on rdflib >= 6 (when no
# destination is given), so decode only when bytes actually come back;
# calling .decode() unconditionally raises AttributeError on newer rdflib.
example1_nt = example1.serialize(format='nt')
if isinstance(example1_nt, bytes):
    example1_nt = example1_nt.decode()
print(example1_nt)

<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236> <http://id.loc.gov/ontologies/bibframe/itemOf> <https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> .
<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> <http://www.w3.org/2000/01/rdf-schema#label> "Granite Bluff, 1900" .
_:ub1bL29C14 <http://www.w3.org/1999/02/22-rdf-syntax-ns#value> "Granite Bluff, 1900" .
<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236> <http://id.loc.gov/ontologies/bibframe/usageAndAccessPolicy> <http://rightsstatements.org/vocab/CNE/1.0/> .
_:ub1bL27C9 <http://www.w3.org/1999/02/22-rdf-syntax-ns#value> "Wyoming--History--1890-" .
<https://plains2peaks.org/765b4eac-83b7-11e7-a4e6-ac87a3129ce6#Work> <http://id.loc.gov/ontologies/bibframe/partOf> <http://digitalcollections.uwyo.edu/luna/servlet/uwydbuwy~96~96> .
<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6> <http://id.loc.gov/ontologies/bibframe/subject> 

In [17]:
import hashlib, pickle
# Preview the first ten N-Triples statements as [subject, predicate, object].
ntriples = example1.serialize(format='nt')
if isinstance(ntriples, bytes):  # rdflib < 6 returns bytes, rdflib >= 6 returns str
    ntriples = ntriples.decode()
for row in ntriples.splitlines()[0:10]:
    if not row.strip():
        continue
    # Split on the first two whitespace runs only: a literal object such as
    # "Granite Bluff, 1900" contains spaces and must stay in one piece.
    fields = row.split(None, 2)
    # Remove the statement terminator " ." that follows the object term.
    fields[2] = fields[2].rstrip()
    if fields[2].endswith("."):
        fields[2] = fields[2][:-1].rstrip()
    print(fields)

['<http://digitalcollections.uwyo.edu/luna/servlet/detail/uwydbuwy~22~22~555818~187236>', '<http://id.loc.gov/ontologies/bibframe/itemOf>', '<https://plains2peaks.org/765b26e8-83b7-11e7-987f-ac87a3129ce6>']


In [None]:
def add_key(tree, value):
    """Store ``value`` in ``tree`` under its SHA-1 hex digest and return the digest.

    Args:
        tree: container supporting ``in`` and ``insert(key, value)``.
        value: ``bytes``, or any object with ``.encode()`` (e.g. ``str``),
            which is encoded before hashing.

    Returns:
        The 40-character hexadecimal SHA-1 digest used as the key. When the
        digest is already present in ``tree`` nothing is inserted.
    """
    payload = value if isinstance(value, bytes) else value.encode()
    digest = hashlib.sha1(payload).hexdigest()
    if digest in tree:
        return digest
    tree.insert(digest, payload)
    return digest

def add_collections(tree,
                    subject_sha1,
                    predicate_sha1,
                    object_sha1):
    """Look up the pickled collections stored for a triple's composite keys.

    Two keys are built, ``"<predicate_sha1>:<object_sha1>"`` and
    ``"<subject_sha1>:<predicate_sha1>"`` (visited in that order). For each
    key present in ``tree`` the stored value is unpickled; otherwise an
    empty list is used.

    NOTE(review): the loaded ``entities`` list is never modified, written
    back or returned, so the function currently has no effect beyond the
    reads -- it looks unfinished.
    NOTE(review): ``pickle.loads`` must only be used on tree files from a
    trusted source.
    """
    collection_keys = (
        "{}:{}".format(predicate_sha1, object_sha1),
        "{}:{}".format(subject_sha1, predicate_sha1),
    )
    for collection_key in collection_keys:
        if collection_key in tree:
            entities = pickle.loads(tree.get(collection_key))
        else:
            entities = []
        
    
    
    
        
def quick_cache(tree, fields):
    """Cache one triple's terms and its subject/predicate -> objects entry.

    ``fields[0]``, ``fields[1]`` and ``fields[2]`` are passed to ``add_key``
    as the subject, predicate and object terms. The list stored under
    ``"<subject_sha1>:<predicate_sha1>"`` is unpickled when present, extended
    with the object's key, de-duplicated through ``set`` and written back
    pickled.

    NOTE(review): the function is unfinished -- the wildcard branch at the
    end only names ``wildcard_subjects``, which is never assigned, so it
    raises NameError if a ``"*:<predicate_sha1>"`` key is ever present.
    """
    # Store each term under its SHA-1 key (add_key skips existing keys).
    subject_sha1 = add_key(tree, fields[0]) 
    predicate_sha1 = add_key(tree, fields[1])
    object_sha1 = add_key(tree, fields[2])
    subject_predicate = "{}:{}".format(subject_sha1,
                                      predicate_sha1)
    objects = []
    if subject_predicate in tree:
        objects = pickle.loads(tree.get(subject_predicate))
    objects.append(object_sha1)
    # set() removes duplicate object keys; the resulting list order is arbitrary.
    tree[subject_predicate] = pickle.dumps(list(set(objects)))
    wildcard_predicate = "*:{}".format(predicate_sha1)
    if wildcard_predicate in tree:
        # NOTE(review): dangling, undefined name -- implementation missing here.
        wildcard_subjects
                               
    
    
    