# Build Notebook
Build Notebook for [Project 1](https://github.com/jermnelson/linked-data-fragments/projects/1)

In [8]:
import datetime, rdflib, pdb, sys, hashlib
from bplustree import BPlusTree, StrSerializer
sys.path.append("D:/2018/linked-data-fragments/cache/")
import btree
# Module-level rdflib graph that accumulates the Colorado College knowledge
# graph; load_cc_scholarship() parses the Turtle source files into it.
CC = rdflib.Graph()
def load_cc_scholarship(paths=None, graph=None):
    """Parse Colorado College Turtle files into an RDF graph.

    Args:
        paths: Iterable of Turtle file locations to parse, in order. When
            omitted (None), the seven default knowledge-graph files listed
            in the body are used. An empty iterable loads nothing.
        graph: Object whose ``parse(path, format='turtle')`` method receives
            each file. When omitted (None), the module-level ``CC`` graph
            is used.

    Returns:
        None. Each file is handed to ``graph.parse``.
    """
    if paths is None:
        # NOTE(review): absolute, machine-specific locations; pass `paths`
        # explicitly to run this notebook on another machine.
        paths = [
            "D:/2018/tiger-catalog/KnowledgeGraph/colorado-college.ttl",
            "D:/2018/tiger-catalog/KnowledgeGraph/cc-people.ttl",
            "D:/2018/tiger-catalog/KnowledgeGraph/cc-2016-2017.ttl",
            "D:/2018/tiger-catalog/KnowledgeGraph/cc-2017-2018.ttl",
            "D:/2018/cc-scholarship-graph/data/cc-research-statements.ttl",
            "D:/2018/cc-scholarship-graph/data/cc-fast-subjects.ttl",
            "D:/2018/cc-scholarship-graph/data/creative-works.ttl",
        ]
    if graph is None:
        # Resolved at call time so the default follows the notebook's global.
        graph = CC
    for path in paths:
        graph.parse(path, format='turtle')
# Populate CC from the Turtle sources, then report how many triples it holds.
load_cc_scholarship()
print("Colorado College Number of Triples: {:,}".format(len(CC)))

Colorado College Number of Triples: 27,992


### Demo of BTree Linked Data Fragments 
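Take each subject, predicate, object triple from the RDF graph, compute the SHA-1 hash of every term's string form (subject, predicate, and object alike), and store each hash with its term in a B+ tree. Each triple's three hashes are then passed to `btree.add_patterns`.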

In [9]:
# On-disk B+ tree with string keys that backs the linked-data-fragments demo.
# NOTE(review): key_size=124 is presumably sized for the keys written by
# btree.add_patterns (a bare SHA-1 hex digest is only 40 characters) --
# confirm against cache/btree.py.
CC_TREE = BPlusTree(
    "D:/2018/linked-data-fragments/tmp/cc-tree.db",
    order=10,
    key_size=124,
    serializer=StrSerializer(),
)

In [10]:
# Load every triple of CC into CC_TREE, printing progress as it goes.
CHECKPOINT_INTERVAL = 10    # triples between "." marks and CC_TREE.checkpoint() calls
REPORT_INTERVAL = 1000      # triples between printed running totals

start = datetime.datetime.utcnow()
print("Starting loading Colorado College's {:,} triples at {} ".format(
    len(CC),
    start.isoformat()))
counter = 0
try:
    for subj, pred, obj in CC:
        # Store each term once, keyed by the SHA-1 hex digest of its string form.
        term_keys = []
        for term in (subj, pred, obj):
            raw = str(term).encode()
            digest = hashlib.sha1(raw).hexdigest()
            if digest not in CC_TREE:
                CC_TREE.insert(digest, raw)
            term_keys.append(digest)
        subj_sha1, pred_sha1, obj_sha1 = term_keys
        btree.add_patterns(CC_TREE, subj_sha1, pred_sha1, obj_sha1)
        if counter > 0 and counter % CHECKPOINT_INTERVAL == 0:
            print(".", end="")
            CC_TREE.checkpoint()
        if counter % REPORT_INTERVAL == 0:
            print("{:,}".format(counter), end="")
        counter += 1
    # Taken before close(), so the reported time covers the loop only.
    end = datetime.datetime.utcnow()
finally:
    # Always close the tree, even when the load fails or is interrupted part-way.
    CC_TREE.close()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days and microseconds.
print("Finished loading at {}, total time {:,} min for {:,} triples".format(
    end,
    round((end - start).total_seconds() / 60.0, 2),
    len(CC)))

Starting loading Colorado College's 27,992 triples at 2018-05-21T22:08:03.160067 
0....................................................................................................1,000....................................................................................................2,000....................................................................................................3,000....................................................................................................4,000....................................................................................................5,000....................................................................................................6,000....................................................................................................7,000....................................................................................................8,000.............................................................................

In [5]:
# Elapsed load time in minutes. total_seconds() covers the whole interval,
# whereas timedelta.seconds drops whole days and microseconds.
(end - start).total_seconds() / 60.0


3.75