# Demo Data Harvester

In [None]:
import json
from src.amgraph import AMFGraph

### Read in Colour Interest Data

In [None]:
# Load the colour-interest records from the example JSON file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding (which is not UTF-8 everywhere).
file_name = '../data/example_colours.json'
with open(file_name, 'r', encoding='utf-8') as fp:
    raw_data = json.load(fp)

# show the first record (as a one-element slice) and the number of records
print('raw data record:', raw_data[:1])
print('nos records', len(raw_data))

### First sequence change occurs at record 30

In [None]:
# Print records 27 through 34 (index followed by the record) so the
# records on either side of index 30 can be compared by eye.
for record_id in range(27, 35):
    print(f"{record_id} {raw_data[record_id]}")

### Second sequence change occurs at record 110

In [None]:
# Print records 100 through 118 (index followed by the record) so the
# records on either side of index 110 can be compared by eye.
for record_id in range(100, 119):
    print(f"{record_id} {raw_data[record_id]}")

### Create a graph for first sequence 

In [None]:
# Build the graph for the first sequence: records 0..29 of raw_data.
sequence_1_graph = AMFGraph()
for idx in range(30):
    record = raw_data[idx]

    # one 'colour_interest' node per record, identified by its record_id as a string
    node_id = ('colour_interest', str(record['record_id']))

    # three 'has_rgb' entries (one per channel, declared range 0..255) ...
    node_attr = {
        ('has_rgb', ('rgb', channel)): {
            'prob': 1.0,
            'numeric': record[channel],
            'numeric_min': 0,
            'numeric_max': 255.0,
        }
        for channel in ('r', 'g', 'b')
    }
    # ... plus one 'has_label' entry carrying the colour name
    node_attr[('has_label', ('colour', record['COLOUR']))] = {'prob': 1.0}

    sequence_1_graph.set_node(node=node_id, node_attr=node_attr)

# plot only the 'colour_interest' node whose id is '2'
sequence_1_graph.plot(
    dimension=3,
    node_filter_func=lambda node: node[0] == 'colour_interest' and node[1] == '2',
)

### Now save this graph into an in-memory cache that will also persist to ArangoDB

First we must start the Dask cluster and ensure ArangoDB is running...

In [None]:
import os

from src.distributed_cache import DistributedCache

# cache setup needs db credentials and Dask Cluster credential
#
# The database username and password are read from the environment when the
# variables AMF_DB_USERNAME / AMF_DB_PASSWORD are set, so real credentials do
# not have to be written into the notebook. The fallbacks are the original
# demo values, kept so the notebook behaves as before when nothing is set.
# NOTE(review): consider removing the fallback password from source entirely.
config = {'db_name': 'AMF',
          'db_username': os.environ.get('AMF_DB_USERNAME', 'stephen'),
          'db_password': os.environ.get('AMF_DB_PASSWORD', 'kontexia.io'),
          'db_system': 'arango_db',
          'db_config_file_path': '~/kontexia/dev/soam/src/databases_configuration.json',
          'db_queries_file_path':'~/kontexia/dev/soam/src/database_queries.json',
          'scheduler_address': 'localhost:8786'}

dc = DistributedCache(config=config)

### Save this graph into a store called 'interest_graphs' with a key called 'sequence_1'

In [None]:
# Put the first-sequence graph into the 'interest_graphs' store under the key
# 'sequence_1'; whatever set_kv returns is kept in `future`.
future = dc.set_kv(
    store_name='interest_graphs',
    key='sequence_1',
    value=sequence_1_graph,
)

### We can now check that it is stored in our ArangoDB




...




### And then we can start a new process to read the graph from the distributed cache


...



### Let's read in the next sequence

In [None]:
# Build the graph for the second sequence: records 30..109 of raw_data.
sequence_2_graph = AMFGraph()
for idx in range(30, 110):

    record = raw_data[idx]

    # each record represents an interest in colour so create a 'colour_interest' node
    #
    node_id = ('colour_interest', str(record['record_id']))

    # each colour_interest node has four attribute nodes: three rgb channels and a label
    #
    # NOTE(review): numeric_max was 1.0 here, while the first-sequence cell
    # declares 255.0 for the same ('rgb', ...) attributes read from the same
    # raw_data. It is aligned to 255.0 on the assumption that all records share
    # one scale - confirm against the data.
    node_attr = {
        ('has_rgb', ('rgb', channel)): {
            'prob': 1.0,
            'numeric': record[channel],
            'numeric_min': 0,
            'numeric_max': 255.0,
        }
        for channel in ('r', 'g', 'b')
    }
    node_attr[('has_label', ('colour', record['COLOUR']))] = {'prob': 1.0}

    sequence_2_graph.set_node(node=node_id, node_attr=node_attr)

# plot only the 'colour_interest' node whose id is '30'
#
sequence_2_graph.plot(dimension=3, node_filter_func=lambda x: x[0] == 'colour_interest' and x[1] == '30')

### And store in the cache with a new key 'sequence_2'

In [None]:
# Put the second-sequence graph into the 'interest_graphs' store under the key
# 'sequence_2'; whatever set_kv returns is kept in `future`.
future = dc.set_kv(
    store_name='interest_graphs',
    key='sequence_2',
    value=sequence_2_graph,
)

In [None]:
# Build the graph for the third sequence: records 110 to the end of raw_data.
sequence_3_graph = AMFGraph()
for idx in range(110, len(raw_data)):

    record = raw_data[idx]

    # each record represents an interest in colour so create a 'colour_interest' node
    #
    node_id = ('colour_interest', str(record['record_id']))

    # each colour_interest node has four attribute nodes: three rgb channels and a label
    #
    # NOTE(review): numeric_max was 1.0 here, while the first-sequence cell
    # declares 255.0 for the same ('rgb', ...) attributes read from the same
    # raw_data. It is aligned to 255.0 on the assumption that all records share
    # one scale - confirm against the data.
    node_attr = {
        ('has_rgb', ('rgb', channel)): {
            'prob': 1.0,
            'numeric': record[channel],
            'numeric_min': 0,
            'numeric_max': 255.0,
        }
        for channel in ('r', 'g', 'b')
    }
    node_attr[('has_label', ('colour', record['COLOUR']))] = {'prob': 1.0}

    sequence_3_graph.set_node(node=node_id, node_attr=node_attr)

# plot only the 'colour_interest' node whose id is '110'
#
sequence_3_graph.plot(dimension=3, node_filter_func=lambda x: x[0] == 'colour_interest' and x[1] == '110')

### And store in cache

In [None]:
# Put the third-sequence graph into the 'interest_graphs' store under the key
# 'sequence_3'; whatever set_kv returns is kept in `future`.
future = dc.set_kv(
    store_name='interest_graphs',
    key='sequence_3',
    value=sequence_3_graph,
)