In [None]:
import os
import json
import collections
from itertools import chain

In [None]:
# Directory that holds the input JSON. Fill this in before running: the empty
# placeholder is rejected by the check below because os.path.isdir('') is False.
DATA_DIR = ''
assert os.path.isdir(DATA_DIR), f"DATA_DIR is not an existing directory: {DATA_DIR!r}"

fname = 'links.json'
fpath = os.path.join(DATA_DIR, fname)
# Fail early, with the resolved path in the message, if the input is missing.
assert os.path.isfile(fpath), f"Input file not found: {fpath!r}"

In [None]:
# Read the input with an explicit encoding so the result does not depend on the
# platform's locale default. Assumes links.json is UTF-8 (the encoding JSON
# interchange requires) -- TODO confirm for files produced on other systems.
with open(fpath, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
def collect_node_attr_values(data, node_id, attr_fields=None):
    """Group records by node id and gather the distinct values seen per attribute.

    Args:
        data: Iterable of dict-like records (each must support ``.get``).
        node_id: Key whose value identifies the node a record belongs to.
            Records that lack the key are grouped under ``None``.
        attr_fields: List of attribute keys to collect from every record.

    Returns:
        dict mapping each node id to ``{field: [distinct non-None values]}``.
        Every field in ``attr_fields`` is present for every node (possibly as
        an empty list) and values are kept in first-seen order.

    Raises:
        ValueError: If ``attr_fields`` is missing or is not a list.
    """
    if not isinstance(attr_fields, list):
        raise ValueError('Must provide metadata attribute fields for node data curation.')
    node_attr_dict = dict()
    for entity in data:
        attrs = node_attr_dict.setdefault(entity.get(node_id), {})
        for field in attr_fields:
            values = attrs.setdefault(field, [])
            new_value = entity.get(field)
            # A membership test (instead of round-tripping through set()) keeps
            # the order deterministic and also tolerates unhashable values.
            if new_value is not None and new_value not in values:
                values.append(new_value)
    return node_attr_dict


def reduce_node_attr_values(data, min_date_field=None, max_date_field=None, delimiter=',', attr_fields=None):
    """Collapse each node's per-attribute value lists into single values.

    Args:
        data: Mapping of node id -> ``{field: list of values}``.
        min_date_field: Field reduced with ``min()`` (smallest value kept).
        max_date_field: Field reduced with ``max()`` (largest value kept).
        delimiter: Separator used to join the values of every other field.
        attr_fields: List of fields to carry over into the result.

    Returns:
        dict mapping each node id to ``{field: reduced value}``. A field with
        no collected values is omitted from that node's record.

    Raises:
        ValueError: If ``attr_fields`` is missing or is not a list.
    """
    if not isinstance(attr_fields, list):
        raise ValueError('Must provide metadata attribute fields for node data curation.')
    node_lookup = dict()
    for node_id, attrs in data.items():
        rec = dict()
        for field in attr_fields:
            values = attrs.get(field, [])
            if not values:
                # Nothing to reduce; min()/max() would raise on an empty list.
                continue
            if field == min_date_field:
                rec[field] = min(values)
            elif field == max_date_field:
                rec[field] = max(values)
            else:
                # str() lets non-string values (e.g. integer ids) be joined;
                # it is a no-op for values that are already strings.
                rec[field] = delimiter.join(str(value) for value in values)
        node_lookup[node_id] = rec
    return node_lookup

In [None]:
def create_node_dict(data):
    """Build the node-attribute lookup for users and their shared values.

    User nodes are keyed by ``user_id`` and keep the earliest ``start_date``.
    Records found under each entity's ``shared_data`` are keyed by ``value``
    and keep the earliest ``first_activity_date`` and the latest
    ``last_activity_date``. Shared entries are merged in last, so they replace
    a user entry that has the same key.

    Args:
        data: Sequence of dict-like entities (iterated more than once).

    Returns:
        dict mapping node id -> reduced attribute record.
    """
    user_fields = ['user_id', 'name', 'email', 'start_date']
    grouped_users = collect_node_attr_values(
        data,
        node_id='user_id',
        attr_fields=user_fields
    )
    nodes = reduce_node_attr_values(
        grouped_users,
        min_date_field='start_date',
        attr_fields=user_fields
    )

    # Gather every entity's shared_data list first, then flatten them.
    per_entity = [entity.get('shared_data', []) for entity in data]
    shared_records = [record for group in per_entity for record in group]
    if not shared_records:
        return nodes

    shared_fields = ['method', 'value', 'first_activity_date', 'last_activity_date']
    grouped_shared = collect_node_attr_values(
        shared_records,
        node_id='value',
        attr_fields=shared_fields
    )
    nodes.update(
        reduce_node_attr_values(
            grouped_shared,
            min_date_field='first_activity_date',
            max_date_field='last_activity_date',
            attr_fields=shared_fields
        )
    )
    return nodes

In [None]:
def build_edges(data):
    """Turn entities and their shared records into a flat list of edge dicts.

    For every shared record with a non-None ``value`` an edge
    ``user_id -> value`` is emitted; when the record also carries a
    ``user_id``, a second edge ``value -> that user_id`` follows it. Each edge
    is stamped with the record's ``last_activity_date``.

    Args:
        data: Iterable of dict-like entities, each optionally holding a
            ``shared_data`` list of dict records.

    Returns:
        list of ``{'source', 'target', 'timestamp'}`` dicts in input order.

    Raises:
        KeyError: If a record with a value has no ``last_activity_date``.
    """
    collected = []
    for entity in data:
        owner = entity.get('user_id')
        for shared in entity.get('shared_data', []):
            shared_value = shared.get('value')
            if shared_value is not None:
                linked_user = shared.get('user_id')
                # the timestamp is required on every record that yields an edge
                stamp = shared['last_activity_date']
                collected.append({'source': owner, 'target': shared_value, 'timestamp': stamp})
                if linked_user is not None:
                    collected.append({'source': shared_value, 'target': linked_user, 'timestamp': stamp})
    return collected


def dedupe_edges(edges):
    """Drop repeated edges, treating an edge and its reverse as the same.

    Two edges are duplicates when they share a timestamp and connect the same
    pair of endpoints in either direction. The first occurrence is kept.

    Args:
        edges: Iterable of dicts with ``source``, ``target`` and ``timestamp``.

    Returns:
        list of the retained edge dicts (the same objects, in input order).
    """
    seen = set()
    unique = []
    for edge in edges:
        src, dst, stamp = edge['source'], edge['target'], edge['timestamp']
        if (src, dst, stamp) not in seen:
            # register both directions so the mirrored edge is skipped later
            seen.update([(src, dst, stamp), (dst, src, stamp)])
            unique.append(edge)
    return unique


def apply_node_attributes(
    edges,
    node_dict=None,
    color_dict=None,
    labels=('source', 'target')
):
    """Annotate each edge with the attributes and color of its endpoints.

    For every label in ``labels`` the endpoint id ``edge[label]`` is looked up.
    A non-empty attribute record is stored under ``"<label>_attrs"``, and a
    color is always stored under ``"<label>_color"`` (``'gray'`` when the id
    has no entry in ``color_dict``).

    Args:
        edges: Iterable of edge dicts containing every key named in ``labels``.
        node_dict: Optional mapping of node id -> attribute record.
        color_dict: Optional mapping of node id -> color.
        labels: Edge keys that hold endpoint ids.

    Returns:
        list of new edge dicts; the dicts passed in are left unmodified.
    """
    # None defaults avoid sharing one mutable default dict between calls.
    node_dict = {} if node_dict is None else node_dict
    color_dict = {} if color_dict is None else color_dict
    edge_table = []
    for edge in edges:
        # Shallow copy so the annotations do not leak into the caller's edges.
        row = dict(edge)
        for label in labels:
            node = row[label]
            attrs = node_dict.get(node, {})
            if attrs:
                row[f"{label}_attrs"] = attrs
            row[f"{label}_color"] = color_dict.get(node, 'gray')
        edge_table.append(row)
    return edge_table

In [None]:
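# Optional, currently disabled: derive a per-node `color_dict` for apply_node_attributes
# from whether each node id appears as a source, a target, or both. The snippet reads
# `edges`, which is only assigned in a later cell, so move it after build_edges()
# before enabling it.
# labeling_dict = collections.defaultdict(list)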

# for edge in edges:
#     for label in ['source', 'target']:
#         id_value = edge[label]
#         labeling_dict[id_value].append(label)

# color_map = {'source & target': 'red', 'source': 'yellow', 'target': 'gray'}
# color_dict = dict(labeling_dict)

# for node_id in color_dict:
#     counts = collections.Counter(color_dict.get(node_id))
#     res = ' & '.join(sorted(list(dict(counts).keys())))
#     color_dict[node_id] = color_map.get(res, 'gray')

In [None]:
### METHODOLOGY ###

# 1. Build the edges from source data
# 2. De-duplicate edge data, reduce redundant info
# 3. Apply node attributes and/or color labels

In [None]:
# Step 1: raw edges from the loaded records.
edges = build_edges(data)

# Steps 2 and 3: drop duplicate edges, then attach the node attributes.
edge_table = apply_node_attributes(
    edges=dedupe_edges(edges),
    node_dict=create_node_dict(data),
)

edge_table