In [25]:
from typing import Dict
from collections import defaultdict
import json
import os
from pprint import pprint

from py2neo import GraphService, Graph, Node, Relationship, Subgraph

import graph_builder as gb

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
G = Graph("bolt://localhost:7687")
# username = "neo4j"
# password = "KrRsKLC26qhHSdj4PG_qRso98GdyDbZOTQrdvvZPr8Q"
# G = Graph(f"neo4j+s://{username}:{password}@f2c1d80b.databases.neo4j.io")

In [27]:
G.delete_all()

In [28]:
with open("data/scraped/classes.json") as f:
    classes = json.load(f)
    
with open("data/scraped/programs.json") as f:
    programs = json.load(f)
    
with open("data/scraped/specialisations.json") as f:
    special = json.load(f)

In [29]:
print(len(classes), len(programs), len(special))

3142 294 449


In [30]:
G.schema.create_uniqueness_constraint('class', 'id')
G.schema.create_uniqueness_constraint('program', 'id')
G.schema.create_uniqueness_constraint('specialisation', 'id')
# G.schema.create_uniqueness_constraint('requirement', 'id')

ClientError: [Schema.EquivalentSchemaRuleAlreadyExists] An equivalent constraint already exists, 'Constraint( id=2, name='constraint_2f5dcb8a', type='UNIQUENESS', schema=(:class {id}), ownedIndex=1 )'.

## Create Nodes

In [31]:
G.run("""
MATCH (n)
RETURN count(*)
""")

count(*)
0


In [32]:
%%time
items = []
for doc in classes:
    items += gb.create_node_if_not_exists(gb.CLASSES, doc, G, doc['id'], 'class'),
print(len(gb.CLASSES))
print(len(items))

3142
3142
CPU times: user 7.63 s, sys: 675 ms, total: 8.3 s
Wall time: 1min 29s


In [33]:
%%time
items = []
for doc in programs:
    items += gb.create_node_if_not_exists(gb.PROGRAMS, doc, G, doc['id'], 'program'),
print(len(gb.PROGRAMS))
print(len(items))

294
294
CPU times: user 678 ms, sys: 55.6 ms, total: 734 ms
Wall time: 7.8 s


In [34]:
%%time
items = []
for doc in special:
    items += gb.create_node_if_not_exists(gb.SPECIAL, doc, G, doc['id'], 'specialisation'),
print(len(gb.SPECIAL))
print(len(items))

449
449
CPU times: user 1.04 s, sys: 84.6 ms, total: 1.12 s
Wall time: 12 s


In [35]:
G.run("""
MATCH (n)
RETURN count(*)
""")

count(*)
3885


## Create Edges

In [36]:
%%time
## create edges for class requisites
items = []
for doc in classes:
    if 'requisites' in doc:
        items.extend(gb.create_nodes_and_edges_if_class_requisite(doc['requisites'], gb.CLASSES[doc['id']], G))

CPU times: user 25.2 s, sys: 2.73 s, total: 28 s
Wall time: 4min 1s


In [37]:
%%time
items2 = []
for doc in programs:
    src_node = gb.PROGRAMS[doc['id']]
    for requirement in doc['requirements']:
        items2.extend(gb.create_nodes_and_edges_if_program(requirement, src_node, G))

CPU times: user 30.9 s, sys: 3.14 s, total: 34 s
Wall time: 5min 22s


In [38]:
%%time
items3 = []
for doc in special:
    src_node = gb.SPECIAL[doc['id']]
    for requirement in doc['requirements']:
        items3.extend(gb.create_nodes_and_edges_if_program(requirement, src_node, G))

CPU times: user 24 s, sys: 2.46 s, total: 26.4 s
Wall time: 4min 13s


In [39]:
G.run("""
MATCH (n)-[]-()
RETURN count(*)
""")

count(*)
47990


In [40]:
gb.PROGRAMS.keys()

dict_keys(['HACCT', 'BADAN', 'HACTS', 'AACOM', 'BACCT', 'HAHCR', 'BACTS', 'BAHCR', 'HADAN', 'AACRD', 'BAPRC', 'HAPRC', 'HART2', 'BAPAF', 'HASIA', 'BARTS', 'HARTS', 'BARTY', 'BAPAR', 'BASIA', 'BASY', 'BBIOT', 'HBIOT', 'BBUSA', 'HBUSA', 'BCLAS', 'HCLAS', 'BCOMM', 'HCOMM', 'BCRIM', 'HCRIM', 'HDESN', 'BDESN', 'HDEVS', 'HECON', 'BDEVY', 'AENGI', 'BECON', 'BDEVS', 'AENRD', 'HENSU', 'BENSU', 'AENSU', 'HENVS', 'HEURO', 'BFINN', 'HFINN', 'AFEST', 'BGENE', 'BIT', 'HIT', 'HGENE', 'BINBS', 'BHLTH', 'HHLTH', 'HINBS', 'BIR', 'HIR', 'BLANY', 'BIRY', 'BINSS', 'HINSS', 'HLANG', 'BINSY', 'BLANG', 'ALLB', 'BMASC', 'HMASC', 'BMEDS', 'HMEDS', 'BMECA', 'HMECA', 'BMUSI', 'HMUSI', 'BPAST', 'HPAST', 'APHSC', 'APNSC', 'APNSN', 'AHUSS', 'HPOLS', 'BPLSC', 'HPLSC', 'BPPE', 'HPPE', 'APSYC', 'BPPOL', 'HPPOL', 'BSC', 'ASCAD', 'HSC', 'BSPSY', 'HSPSY', 'BSCY', 'ASSAN', 'ASSAE', 'HSTAT', 'BSTUD', 'HVART', 'ELANG', '4050FDD', 'BSTAT', 'BVART', '4750FDD', '4350FDD', '8950XMCHD', 'MEMPA', 'MEMPP', '6459XGCACC', '7050FDM', 

In [41]:
ret = G.nodes.match('program', name='Master of Computing')
ret

<py2neo.matching.NodeMatch at 0x114fb88b0>

In [42]:
node = ret.first()
node

Node('program', id='7706XMCOMP', n_units=96, name='Master of Computing')

In [43]:
ret = G.relationships.match([node])
list(ret)

[Requirement(Node('program', id='7706XMCOMP', n_units=96, name='Master of Computing'), Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), description='The 96 units must consist of:'),
 Requirement(Node('program', id='7706XMCOMP', n_units=96, name='Master of Computing'), Node('requirement', description='The Master of Computing requires the completion of 96 units, of which:', id='318898639986', n_units=96), description='The Master of Computing requires the completion of 96 units, of which:')]

In [44]:
edges = []
for edge in ret:
    edges.extend(list(G.relationships.match([edge.end_node])))

In [45]:
edges

[Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='12 units from completion of elective courses offered by ANU.', id='317415999431', n_units=12), description='12 units from completion of elective courses offered by ANU.'),
 Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='18 units from completion of further courses from the subject area COMP Computer Science, or non-COMP courses included in specialisation lists..', id='281781585513', n_units=18), description='18 units from completion of further courses from the subject area COMP Computer Science, or non-COMP courses included in specialisation lists..'),
 Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='24 units from the completion of one of the following Speciali

In [46]:
items = []
for edge in edges:
    items.extend(list(G.relationships.match([edge.start_node])))

In [47]:
items

[Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='12 units from completion of elective courses offered by ANU.', id='317415999431', n_units=12), description='12 units from completion of elective courses offered by ANU.'),
 Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='18 units from completion of further courses from the subject area COMP Computer Science, or non-COMP courses included in specialisation lists..', id='281781585513', n_units=18), description='18 units from completion of further courses from the subject area COMP Computer Science, or non-COMP courses included in specialisation lists..'),
 Requirement(Node('requirement', description='The 96 units must consist of:', id='246818236269', n_units=96), Node('requirement', description='24 units from the completion of one of the following Speciali