## Feature generation

This is the core of COLOGNE, where we generate discrete embeddings per node. Below we implement several methods for feature generation: i) random walks, ii) NodeSketch sampling, iii) min-wise independent (L0) sampling, iv) L1 sampling and v) L2 sampling.
More details are provided as comments in the code below.

In [71]:
import os
import sys
import networkx as nx
import pandas as pd
import json
import random
import math
import copy
import time
from BinaryStream import BinaryStream
from TabulationHashing import TabulationHashing

In [72]:
# Dataset selection: `idx` picks the graph to process from `graphnames`.
graphnames = ['Cora', 'Citeseer', 'Pubmed', 'HomoSapiens', 'Wikipedia', 'BlogCatalog']
idx = 3
graphname = graphnames[idx]
# The flag is switched on for every dataset listed after the first three.
create_features = idx > 2
emb_size = 50  # how many samples per node to generate
depth = 4  # the depth of the local neighborhood
data_dir = os.path.expanduser("../Graphs/" + graphname)

In [73]:
randompath = "../random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [74]:
nodedata_path = data_dir + "/data/nodedata.json" 
with open(nodedata_path, "r") as read_file:
    nodedata = json.load(read_file)

In [75]:
# an example of node information. The node label and a dictionary of words (with weights) describing the node 
nodedata[list(nodedata.keys())[129]]

['label=20=35=', {'129': 1}]

In [76]:
nodedata

{'0': ['Nonelabel=5=15=36=38=45=', {'0': 1}],
 '1': ['label=29=', {'1': 1}],
 '2': ['label=4=', {'2': 1}],
 '3': ['label=29=', {'3': 1}],
 '4': ['label=1=8=', {'4': 1}],
 '5': ['label=2=21=24=', {'5': 1}],
 '6': ['label=41=', {'6': 1}],
 '7': ['label=2=', {'7': 1}],
 '8': ['label=27=48=', {'8': 1}],
 '9': ['Nonelabel=26=', {'9': 1}],
 '10': ['label=9=', {'10': 1}],
 '11': ['label=45=', {'11': 1}],
 '12': ['label=26=33=', {'12': 1}],
 '13': ['label=26=33=', {'13': 1}],
 '14': ['label=27=', {'14': 1}],
 '15': ['label=5=33=', {'15': 1}],
 '16': ['label=20=', {'16': 1}],
 '17': ['label=31=39=', {'17': 1}],
 '18': ['label=23=25=26=29=', {'18': 1}],
 '19': ['label=26=', {'19': 1}],
 '20': ['Nonelabel=7=', {'20': 1}],
 '21': ['label=48=', {'21': 1}],
 '22': ['label=25=33=43=', {'22': 1}],
 '23': ['label=9=', {'23': 1}],
 '24': ['Nonelabel=39=48=', {'24': 1}],
 '25': ['Nonelabel=27=39=', {'25': 1}],
 '26': ['label=35=38=', {'26': 1}],
 '27': ['label=9=', {'27': 1}],
 '28': ['label=5=', {'28': 

In [77]:
def read_graph_from_edge_list(filename, nodedata):
    """Build an undirected graph from a ':'-separated edge-list file.

    The file is opened at ``data_dir + "/data/" + filename`` (``data_dir`` is the
    notebook-level setting). Every line is split on ':'; when at least two
    fields result, the first two (whitespace-stripped) are the edge endpoints.
    An edge is added only if both endpoints are keys of ``nodedata``.

    Prints the number of lines read and returns the ``networkx.Graph``.
    """
    graph = nx.Graph()
    edge_path = data_dir + "/data/" + filename
    lines_read = 0
    with open(edge_path, 'r') as edgefile:
        for lines_read, line in enumerate(edgefile, start=1):
            fields = line.split(':')
            if len(fields) <= 1:
                continue  # no ':' separator on this line, nothing to add
            u, v = fields[0].strip(), fields[1].strip()
            if u in nodedata and v in nodedata:
                graph.add_edge(u, v)

    print(lines_read)
    return graph
    

In [78]:
G = read_graph_from_edge_list("all_graph_edges.txt", nodedata)

38739


In [79]:
G.number_of_nodes(), G.number_of_edges()

(3890, 38739)

In [80]:
H = read_graph_from_edge_list("graph_edges_reduced.txt", nodedata)

30991


In [81]:
H.number_of_nodes(), H.number_of_edges()

(3884, 30991)

In [82]:
R = read_graph_from_edge_list("removed_edges.txt", nodedata)

7748


In [83]:
R.number_of_nodes(), R.number_of_edges()

(2822, 7748)

In [84]:
def get_labels(label):
    """Extract the label ids from a string such as ``'label=23=25='``.

    Strings starting with ``'None'`` yield an empty list. Otherwise the string
    is split on '=' and everything between the first and the last field is
    returned (the leading ``'label'`` token and the trailing empty field are
    dropped).
    """
    if label.startswith('None'):
        return []
    return label.split('=')[1:-1]

In [85]:
get_labels('label=23=25=26=29=')

['23', '25', '26', '29']

In [86]:
def generate_features(G, nodedata):
    """Replace each node's attribute dict by one built from its own labels.

    For every node ``u`` of ``G`` the result maps ``u`` to
    ``[raw_label, {label_id: 1, ...}]`` where the label ids come from
    ``get_labels(raw_label)``. Nodes without labels get the single
    pseudo-feature ``'none'``.

    Note: an earlier variant also accumulated neighbour labels and normalised
    the weights; that code was disabled, so all weights are exactly 1.
    """
    def label_features(raw_label):
        # Fall back to the 'none' pseudo-feature when no label ids are found.
        label_ids = get_labels(raw_label) or ['none']
        return {label_id: 1 for label_id in label_ids}

    return {
        u: [nodedata[u][0], label_features(nodedata[u][0])]
        for u in G.nodes()
    }

In [87]:
new_nodedata = generate_features(G, nodedata)

In [88]:
new_nodedata

{'0': ['Nonelabel=5=15=36=38=45=', {'none': 1.0}],
 '137': ['label=10=', {'10': 1.0}],
 '206': ['label=29=', {'29': 1.0}],
 '243': ['label=21=', {'21': 1.0}],
 '954': ['label=20=', {'20': 1.0}],
 '974': ['label=31=', {'31': 1.0}],
 '1552': ['label=40=', {'40': 1.0}],
 '1586': ['label=5=', {'5': 1.0}],
 '1839': ['label=22=27=35=38=',
  {'22': 0.25, '27': 0.25, '35': 0.25, '38': 0.25}],
 '1903': ['label=39=', {'39': 1.0}],
 '2137': ['label=8=', {'8': 1.0}],
 '2158': ['Nonelabel=36=', {'none': 1.0}],
 '2367': ['label=16=', {'16': 1.0}],
 '3006': ['label=22=42=', {'22': 0.5, '42': 0.5}],
 '3225': ['Nonelabel=41=', {'none': 1.0}],
 '3361': ['label=31=39=48=',
  {'31': 0.3333333333333333,
   '39': 0.3333333333333333,
   '48': 0.3333333333333333}],
 '3405': ['label=25=', {'25': 1.0}],
 '3583': ['label=25=43=', {'25': 0.5, '43': 0.5}],
 '3644': ['label=30=', {'30': 1.0}],
 '4': ['label=1=8=', {'1': 0.5, '8': 0.5}],
 '7': ['label=2=', {'2': 1.0}],
 '47': ['label=5=12=29=33=41=43=',
  {'5': 0.16

In [89]:
nodedata = new_nodedata

# Random walk sampling

In [90]:
# generate a random int in [start, end]
def get_rnd_int_in_range(rand_gen, start, end):
    """Return a pseudo-random integer in the closed range ``[start, end]``.

    A random integer key is drawn from the global ``random`` module, hashed
    with ``rand_gen.hashValueInt`` and reduced modulo the size of the range.

    Args:
        rand_gen: object exposing ``hashValueInt(int) -> int``.
        start: smallest value that may be returned.
        end: largest value that may be returned (inclusive).
    """
    # Use an integer bound: ``random.randint`` no longer accepts float
    # arguments such as 1e10 (deprecated in Python 3.10, TypeError in 3.12).
    key = random.randint(0, 10**10)
    span = end - start + 1
    return rand_gen.hashValueInt(key) % span + start

In [91]:
get_rnd_int_in_range(rand_gen, 0, 1)

0

In [92]:
# generate a random hash value that is at least min_val (min_val must be < 1)
def get_rnd_value(rand_gen, min_val):
    """Return a hashed random value that is not smaller than ``min_val``.

    Random integer keys are drawn from the global ``random`` module and hashed
    with ``rand_gen.hashValue`` until the hash is at least ``min_val``.

    Args:
        rand_gen: object exposing ``hashValue(int) -> float``.
        min_val: lower bound for the returned value; must be below 1.

    Raises:
        Exception: if ``min_val >= 1``.
    """
    if min_val >= 1:
        raise Exception("Minimum must be less than 1")
    # Always draw at least once: the previous version started from rval = 0
    # and therefore returned 0 without sampling whenever min_val <= 0.
    while True:
        # Integer bound: random.randint rejects floats (1e14) on Python >= 3.12.
        key = random.randint(0, 10**14)
        rval = rand_gen.hashValue(key)
        if not rval < min_val:
            return rval

In [93]:
get_rnd_value(rand_gen, 1e-6)

0.5303290747660145

In [94]:
# a standard random walk starting from a node for 'depth' hops 
def random_walk(G, node, depth, features, rand_gen):
    """Run a lazy random walk and sample one feature of the end node.

    Starting at ``str(node)``, up to ``depth`` steps are taken. At each step
    the next node is picked uniformly (via ``get_rnd_int_in_range``) among the
    current node and its neighbours, so the walk may stay in place. The walk
    stops early at a node of degree 0.

    Args:
        G: graph exposing ``degree``, ``neighbors`` and ``nodes``.
        node: start node (converted to ``str``).
        depth: maximum number of steps.
        features: mapping ``node -> (subject, {feature: weight})``.
        rand_gen: hash-based generator passed on to ``get_rnd_int_in_range``.

    Returns:
        ``(end_node, subject, feature)`` where ``feature`` is drawn from the
        end node's features proportionally to their weights.

    Raises:
        ValueError: if the end node has no features to sample from.
    """
    curr_node = str(node)
    for _ in range(depth):
        if not G.degree[curr_node] > 0:
            break
        # The current node is a candidate too, which makes the walk lazy.
        candidates = [curr_node] + list(G.neighbors(curr_node))
        curr_node = candidates[get_rnd_int_in_range(rand_gen, 0, len(candidates) - 1)]

    subject, features_node = features[curr_node]
    if len(features_node) == 0:
        raise ValueError("node %r has no features to sample from: %r"
                         % (curr_node, features[curr_node]))

    # Draw the feature from a private generator. Re-seeding the *global*
    # ``random`` module here (as was done before) resets the stream that
    # ``get_rnd_int_in_range`` draws its keys from, so every subsequent walk
    # became a deterministic function of a seed in [0, number of nodes].
    seed = get_rnd_int_in_range(rand_gen, 0, len(G.nodes()))
    feature_rng = random.Random(seed)
    w = feature_rng.choices(population=list(features_node.keys()),
                            weights=list(features_node.values()), k=1)[0]
    return curr_node, subject, w

In [95]:
node = list(H.nodes())[40]
random_walk(H, node, 2, features=nodedata, rand_gen=rand_gen)

('3480', 'label=12=42=', '12')

In [96]:
# for each node generate a number of samples, i.e. the embedding size, by random walks
def all_nodes_random_walk(G, depth, nr_walks, features, rand_gen):
    """Collect ``nr_walks`` random-walk samples for every node of ``G``.

    Returns a dict mapping each node to a list of ``nr_walks`` tuples
    ``(sampled_node, subject, feature)`` as produced by ``random_walk``.
    """
    def one_walk(start):
        sample, subject, feature = random_walk(G, start, depth, features, rand_gen)
        return (sample, subject, feature)

    return {
        node: [one_walk(node) for _ in range(nr_walks)]
        for node in G.nodes()
    }

In [97]:
# random walks on the full graph
# vectors_rw_all[d] holds the random-walk samples of all nodes for walks of
# at most d steps (d = 0 .. depth).
vectors_rw_all = []
for d in range(depth+1):
    # A fresh hash generator is built from the same bits file for every depth.
    randompath = "../random/bits.01"
    rand_gen = TabulationHashing(randompath, rows=4, shift=16)
    start = time.time()
    vectors_rw = all_nodes_random_walk(G, d, emb_size, features=nodedata, rand_gen=rand_gen)
    vectors_rw_all.append(vectors_rw)
    end = time.time()
    # The timing covers only the walk generation, not the generator setup.
    print('Elapsed time RW', end-start)

Elapsed time RW 2.4027817249298096
Elapsed time RW 3.6505181789398193
Elapsed time RW 4.691741704940796
Elapsed time RW 6.170156002044678
Elapsed time RW 8.921931505203247


In [98]:
# random walks on the reduced graph used for link prediction
# vectors_rw_reduced = [] 
# for d in range(depth+1):
#     randompath = "../random/bits.01"
#     rand_gen = TabulationHashing(randompath, rows=4, shift=16)
#     vectors_rw = all_nodes_random_walk(H, d, emb_size, features=nodedata, rand_gen=rand_gen)
#     vectors_rw_reduced.append(vectors_rw)

In [99]:
vectors_rw_all[1][list(vectors_rw_all[0].keys())[10]]

[('955', 'Nonelabel=38=', 'none'),
 ('3682', 'label=34=', '34'),
 ('3682', 'label=34=', '34'),
 ('955', 'Nonelabel=38=', 'none'),
 ('955', 'Nonelabel=38=', 'none'),
 ('2137', 'label=8=', '8'),
 ('3682', 'label=34=', '34'),
 ('955', 'Nonelabel=38=', 'none'),
 ('955', 'Nonelabel=38=', 'none'),
 ('955', 'Nonelabel=38=', 'none'),
 ('2137', 'label=8=', '8'),
 ('0', 'Nonelabel=5=15=36=38=45=', 'none'),
 ('955', 'Nonelabel=38=', 'none'),
 ('2137', 'label=8=', '8'),
 ('2137', 'label=8=', '8'),
 ('0', 'Nonelabel=5=15=36=38=45=', 'none'),
 ('1189', 'label=47=', '47'),
 ('3682', 'label=34=', '34'),
 ('1189', 'label=47=', '47'),
 ('1189', 'label=47=', '47'),
 ('0', 'Nonelabel=5=15=36=38=45=', 'none'),
 ('2137', 'label=8=', '8'),
 ('0', 'Nonelabel=5=15=36=38=45=', 'none'),
 ('1189', 'label=47=', '47'),
 ('955', 'Nonelabel=38=', 'none'),
 ('1189', 'label=47=', '47'),
 ('1189', 'label=47=', '47'),
 ('1189', 'label=47=', '47'),
 ('2137', 'label=8=', '8'),
 ('2137', 'label=8=', '8'),
 ('3682', 'label=3

In [100]:
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_rwalk_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_rw_all[d], outfile)

In [101]:
# for d in range(depth+1):
#     jsonpath = data_dir + "/vectors/vectors_rwalk_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
#     with open(jsonpath, 'w') as outfile:
#         json.dump(vectors_rw_reduced[d], outfile)

# Nodesketch sampling

In [102]:
def update_dict(d, k, rand_gen, min_val, seed):
    """Assign a seeded random value to ``d[k]`` unless ``k`` is already set.

    The global ``random`` module is re-seeded with ``seed`` on every call,
    whether or not a value is stored. ``rand_gen`` and ``min_val`` are
    accepted for signature compatibility but are not used.
    """
    random.seed(seed)
    if k in d:
        return
    d[k] = random.random()

In [103]:
def ioffe_sampling(arr, weights):
    """Pick the entry of ``arr`` with the smallest ``-log(r) / weight`` score.

    Args:
        arr: mapping ``key -> (feature, weight, label, ...)``.
        weights: mapping ``key -> r`` with the random value of each key
            (passed to ``math.log``, so it must be positive).

    Returns:
        ``(key, feature, weight, label)`` of the entry with the minimum score;
        on ties the entry seen first wins. For an empty ``arr`` the result is
        ``(None, None, 0, None)``.
    """
    # Start from +inf rather than a finite sentinel: with the former 1e6 cap
    # an input whose scores were all >= 1e6 (very small weights) produced the
    # "empty" result even though ``arr`` was not empty.
    best_score = math.inf
    best = (None, None, 0, None)
    for node, vals in arr.items():
        feature, weight, label = vals[0], vals[1], vals[2]
        score = -math.log(weights[node]) / weight
        if score < best_score:
            best_score = score
            best = (node, feature, weight, label)
    return best

In [104]:
def update_arr(arr, new_node):
    """Merge ``new_node = (key, feature, weight, label)`` into ``arr`` in place.

    If ``key`` is already present its stored weight is increased by ``weight``
    and feature/label are replaced by the new ones; otherwise a new entry
    ``key -> (feature, weight, label)`` is created. Returns ``arr``.
    """
    key, feature, weight, label = new_node[0], new_node[1], new_node[2], new_node[3]
    total = arr[key][1] + weight if key in arr else weight
    arr[key] = (feature, total, label)
    return arr

In [105]:
def nodesketch_iter(G, nodedata, depth, emb_size, rand_gen):
    """Generate NodeSketch-style samples for hops 0..depth.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping ``node -> [label, {feature: weight}]``.
        depth: number of neighbourhood-merging iterations.
        emb_size: number of independent samples (embedding slots) per node.
        rand_gen: forwarded to ``update_dict``.

    Returns:
        A list with ``depth + 1`` dicts; entry ``d`` maps every node of ``G``
        to a list of ``emb_size`` tuples ``(sampled_node, label, feature)``.

    Side effects: re-seeds the global ``random`` module and prints progress.
    """
    
    min_val = 1e-6
    # Step 1: one random value per feature and embedding slot. ``cnt`` grows
    # over all slots, so every update_dict call receives a distinct seed.
    feats_rnd = [{} for _ in range(emb_size)]
    cnt = 0
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, rand_gen, min_val, seed=13*cnt)
                
    # Step 2 (hop 0): for each slot, sample one of the node's own features.
    node_labels = [{} for _ in range(emb_size)]
    for i in range(emb_size):
        node_labels_i = node_labels[i]
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            arr = {}
            for f, weight_f in feats[1].items():
                # keyed by feature so the per-feature random values are used
                arr[f] = (f, weight_f, feats[0])
            _, feature_sample, weight_sample, label_sample = ioffe_sampling(arr, feats_rnd_i)
            node_labels_i[node] = (node, feature_sample, weight_sample, label_sample)
            
    print('Sampled features')
    
    # Step 3: one random value per graph node and slot; the same values are
    # reused in every iteration below.
    node_rnd_vals_all = [{} for _ in range(emb_size)]
    for t in range(emb_size):
        random.seed(1223*t)
        for u in G.nodes():
            node_rnd_vals_all[t][u] = random.random()
            
    # Step 4: node_labels_all[d][t][u] is the sample of node u in slot t after
    # d merging iterations.
    node_labels_all = [[{} for _ in range(emb_size)] for _ in range(depth+1)]
    node_labels_all[0] = node_labels
    for d in range(depth):
        node_labels_iter = node_labels_all[d]
        print('Iteration', d)
        # NOTE(review): nothing in the loop below appears to draw from the
        # global RNG, so this re-seed looks like a leftover - confirm.
        random.seed(31*d)
        
        for t in range(emb_size):
            node_labels_iter_t = node_labels_iter[t]
            for u in G.nodes():
                node_sample_u, feature_sample_u, weight_sample_u, label_u = node_labels_iter_t[u]
                # start from u's own previous sample ...
                arr_u = {node_sample_u: (feature_sample_u, weight_sample_u, label_u)} 
                for v in G.neighbors(u):
                    node_sample_v, feature_sample_v, weight_sample_v, label_v = node_labels_iter_t[v]
                    # ... and merge the neighbours' samples; weights of the
                    # same sampled node are summed by update_arr
                    update_arr(arr_u, (node_sample_v, feature_sample_v, weight_sample_v, label_v))
                node_labels_all[d+1][t][u] = ioffe_sampling(arr_u, node_rnd_vals_all[t]) 
                
    # Step 5: reorder the stored (node, feature, weight, label) tuples into
    # (node, label, feature) and group them per hop and node.
    node_embeddings = [{n:[] for n in G.nodes()} for _ in range(depth+1)]
    for d in range(depth+1):
        for u in G.nodes():
            for nl in node_labels_all[d]:
                node_embeddings[d][u].append((nl[u][0], nl[u][3], nl[u][1]))
    return node_embeddings
                    

In [106]:
randompath = "../random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [107]:
start = time.time()
vectors_ns_all= nodesketch_iter(G, nodedata, depth=depth, emb_size=emb_size, rand_gen=rand_gen)
end = time.time()
print('Elapsed time Nodesketch', end-start)

Sampled features
Iteration 0
Iteration 1
Iteration 2
Iteration 3
Elapsed time Nodesketch 13.246364116668701


In [108]:
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_nodesketch_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_ns_all[d], outfile)

In [109]:
# vectors_ns_reduced = nodesketch_iter(H, nodedata, depth=depth, emb_size=emb_size, rand_gen=rand_gen)

In [110]:
# for d in range(depth+1):
#     jsonpath = data_dir + "/vectors/vectors_nodesketch_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
#     with open(jsonpath, 'w') as outfile:
#         json.dump(vectors_ns_reduced[d], outfile)

# Min-wise (L0) sampling

In [111]:
# # Iteratively collect the minimum value for each node from its neighbors.
# # k iterations correspond to sampling from the k-hop neighborhood
# def minwise_iterate(G, rnd_nodes, nr_iter): #, features):
#     node_labels = [{} for _ in range(nr_iter+1)]
#     if nr_iter < 1:
#         raise Exception("There must be at least one iteration")
#     node_labels[0] = rnd_nodes
#     for iter in range(nr_iter):
#         rnd_nodes_iter = node_labels[iter]
#         print('Iteration', iter)
#         for u in G.nodes():
#             w_u = rnd_nodes_iter[u]
#             for v in G.neighbors(u):
#                 w_u = min(rnd_nodes_iter[v], w_u)
#             node_labels[iter+1][u] = w_u
#     return node_labels

In [112]:
# initialize random numbers for nodes and features for each embedding 
def init_dicts(nodedata, emb_size, rand_gen=rand_gen):
    """Create one ``{feature: random value}`` dict per embedding slot.

    Every feature occurring in ``nodedata`` (``node -> [label, {feature: w}]``)
    gets a value through ``update_dict``. The call counter runs across all
    slots, and each call is seeded with ``17 * counter``.

    Returns a list of ``emb_size`` dicts.
    """
    min_val = 1e-6
    feats_rnd = []
    calls = 0
    for _ in range(emb_size):
        slot_values = {}
        for feats in nodedata.values():
            for f in feats[1]:
                calls += 1
                update_dict(slot_values, f, rand_gen, min_val, seed=17*calls)
        feats_rnd.append(slot_values)
    return feats_rnd

In [113]:
def generate_minwise_samples(G, nodedata, feats_rnd, depth, emb_size):
    """Min-wise sampling over the 0..depth hop neighbourhoods.

    Hop 0 picks, per embedding slot, the feature of each node with the
    smallest random value in ``feats_rnd``. Each further hop replaces a node's
    sample by the minimum over its own and its neighbours' previous samples
    (tuples compare by random value first).

    Returns a list of ``depth + 1`` dicts mapping every node of ``G`` to a list
    of ``emb_size`` tuples ``(sampled_node, label, feature)``.
    """
    # Hop 0: per slot, the node's own feature with the smallest random value.
    hop0 = []
    for slot in range(emb_size):
        rnd = feats_rnd[slot]
        samples = {}
        for node, feats in nodedata.items():
            best_value, best_feature = 1e3, None
            for f in feats[1]:
                value = rnd[f]
                if value < best_value:
                    best_value, best_feature = value, f
            samples[node] = (best_value, node, feats[0], best_feature)
        hop0.append(samples)

    # Hops 1..depth: propagate the minimum through the neighbourhood.
    levels = [hop0]
    for d in range(depth):
        print('Iteration', d)
        previous = levels[d]
        current = [{} for _ in range(emb_size)]
        for u in G.nodes():
            for slot in range(emb_size):
                prev_slot = previous[slot]
                smallest = prev_slot[u]
                for v in G.neighbors(u):
                    smallest = min(prev_slot[v], smallest)
                current[slot][u] = smallest
        levels.append(current)

    # Drop the random value, keeping (sampled node, label, feature).
    return [
        {
            u: [(slot_samples[u][1], slot_samples[u][2], slot_samples[u][3])
                for slot_samples in levels[d]]
            for u in G.nodes()
        }
        for d in range(depth + 1)
    ]

In [114]:
randompath = "../random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [115]:
start = time.time()
# nodes_rnd, labels_rnd, 
feats_rnd = init_dicts(nodedata, emb_size, rand_gen=rand_gen)
vectors_mw_all = generate_minwise_samples(G, nodedata, feats_rnd, depth=depth, emb_size=emb_size)
end = time.time()
print('Elapsed time MW', end-start)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Elapsed time MW 8.190699577331543


In [116]:
# vectors_mw_all[0]

In [117]:
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_minwise_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_mw_all[d], outfile)

In [118]:
# vectors_mw_reduced = generate_minwise_samples(H, nodedata, feats_rnd, depth=depth, emb_size=emb_size)

In [119]:
# for d in range(depth+1):
#     jsonpath = data_dir + "/vectors/vectors_minwise_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
#     with open(jsonpath, 'w') as outfile:
#         json.dump(vectors_mw_reduced[d], outfile)

# L1 sampling

In [120]:
# generating L1 samples from the k-hop neighborhood
# top is the summary size of the frequent items mining algorithm
def generate_L1_samples(G, nodedata, rand_gen, depth, emb_size, top):
    """Build per-node sketches for L1 sampling over hops 0..depth.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping ``node -> (subject, {feature: weight})``.
        rand_gen: forwarded to ``update_dict``.
        depth: number of neighbourhood-merging iterations.
        emb_size: number of independent sketches (embedding slots) per node.
        top: maximum number of entries kept in a node's sketch.

    Returns:
        ``sketches`` such that ``sketches[d][slot][u]`` is a dict
        ``sampled_node -> (weight, feature)`` describing node ``u`` after
        ``d`` iterations.

    Side effects: re-seeds the global ``random`` module (through
    ``update_dict``) and prints progress.
    """
    min_val = 1e-6

    # One random value per feature and slot; the counter runs over all slots
    # so that every update_dict call gets its own seed.
    feats_rnd = [{} for _ in range(emb_size)]
    cnt = 0
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, rand_gen, min_val, seed=23*cnt)

    sketches = [[{} for _ in range(emb_size)] for _ in range(depth+1)]

    # Hop 0: per slot keep the node's feature maximising weight / random value.
    cnt = 0
    for u, (subject, features) in nodedata.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('nodes processed', cnt)
        for i in range(emb_size):
            feats_rnd_i = feats_rnd[i]
            sketches[0][i][u] = {}
            max_w = 0
            max_f = None
            for f, w_f in features.items():
                w_rnd = w_f/feats_rnd_i[f]
                if w_rnd > max_w:
                    max_w = w_rnd
                    max_f = f
            if max_w > 0:
                sketches[0][i][u] = {u : (max_w, max_f)}

    # Iterate over neighbourhoods and maintain the heaviest nodes.
    for d in range(depth):
        print('ITERATION', d)
        for emb in range(emb_size):
            # The previous hop is only read, and its values are immutable
            # tuples, so no deep copy is needed here.
            prev_sketches = sketches[d][emb]
            new_sketches = {}
            for u in G.nodes():
                if u not in prev_sketches:
                    continue
                # Shallow copy: entries are replaced, never mutated in place.
                sketch_u = dict(prev_sketches[u])
                for v in G.neighbors(u):
                    # A neighbour without a sketch contributes nothing
                    # (same treatment as for u itself above).
                    for t, (w_f, f) in prev_sketches.get(v, {}).items():
                        weight = sketch_u.get(t, (0, None))[0] + w_f
                        sketch_u[t] = (weight, f)

                # Mining heavy hitters: keep the `top` heaviest entries and
                # subtract the weight of the first entry that was cut off.
                triples = sorted(
                    ((w_t, f_t, t) for t, (w_t, f_t) in sketch_u.items()),
                    reverse=True)
                to_subtract = triples[top][0] if len(triples) > top else 0
                new_sketches[u] = {tr[2] : (tr[0]-to_subtract, tr[1])
                                   for tr in triples[:top]}
            sketches[d+1][emb] = new_sketches
    return sketches

In [121]:
randompath = "../random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [122]:
top=15

In [123]:
start = time.time()
sketches_l1_all = generate_L1_samples(G, nodedata, rand_gen=rand_gen, depth=depth, emb_size=emb_size, top=top)
end = time.time()
print('Elapsed time L1', end-start)

nodes processed 1000
nodes processed 2000
nodes processed 3000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3
Elapsed time L1 130.57285165786743


In [124]:
# sketches_l1_reduced = generate_L1_samples(H, nodedata, rand_gen, depth=depth, emb_size=emb_size, top=top)

In [125]:
# len(sketches_l1_all[2]), len(sketches_l1_reduced[2])

In [126]:
def get_embeddings_l1_2(nodedata, sketches):
    """Turn L1/L2 sketches into embeddings by taking the heaviest entry.

    Args:
        nodedata: mapping ``node -> [label, ...]``; supplies the node label.
        sketches: ``sketches[d][slot][node]`` is a dict
            ``sampled_node -> (weight, feature)``.

    Returns:
        A list with one dict per hop ``d`` mapping every node of ``nodedata``
        to a list of tuples ``(sampled_node, label, feature)``; one tuple per
        slot whose sketch has an entry of positive weight.
    """
    embeddings = [{node: [] for node in nodedata.keys()} for _ in sketches]
    for d, level in enumerate(sketches):
        # Iterate over the slots actually present rather than relying on the
        # notebook-global ``emb_size``.
        for slot_sketches in level:
            for node, dct in slot_sketches.items():
                max_node = None
                max_word = None
                max_weight = 0
                for sampled_node, ww in dct.items(): # ww: (weight, word)
                    if ww[0] > max_weight:
                        # Remember the node together with its word: reporting
                        # the loop variable after the loop would return the
                        # last key of the sketch, not the heaviest one.
                        max_node = sampled_node
                        max_word = ww[1]
                        max_weight = ww[0]
                label = nodedata[node][0]
                if max_weight > 0:
                    embeddings[d][node].append((max_node, label, max_word))
    return embeddings

In [127]:
vectors_l1_all = get_embeddings_l1_2(nodedata, sketches_l1_all)

In [128]:
# vectors_l1_all 

In [129]:
# vectors_l1_reduced = get_embeddings_l1_2(nodedata, sketches_l1_reduced)

In [130]:
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l1_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_l1_all[d], outfile)

In [131]:
# for d in range(depth+1):
#     jsonpath = data_dir + "/vectors/vectors_l1_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
#     with open(jsonpath, 'w') as outfile:
#          json.dump(vectors_l1_reduced[d], outfile)

# L2 sampling

In [132]:
# generating L2 samples from the k-hop neighborhood
# top is the summary size of the frequent items mining algorithm
def generate_L2_samples(G, nodedata, rand_gen, depth, emb_size, top):
    """Build per-node sketches for L2 sampling over hops 0..depth.

    Same scheme as the L1 variant, except that feature weights are divided by
    the square root of the feature's random value and that truncating a sketch
    to its ``top`` heaviest entries does not subtract anything.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping ``node -> (subject, {feature: weight})``.
        rand_gen: forwarded to ``update_dict``.
        depth: number of neighbourhood-merging iterations.
        emb_size: number of independent sketches (embedding slots) per node.
        top: maximum number of entries kept in a node's sketch.

    Returns:
        ``sketches`` such that ``sketches[d][slot][u]`` is a dict
        ``sampled_node -> (weight, feature)`` describing node ``u`` after
        ``d`` iterations.

    Side effects: re-seeds the global ``random`` module (through
    ``update_dict``) and prints progress.
    """
    min_val = 1e-6

    # One random value per feature and slot; the counter runs over all slots
    # so that every update_dict call gets its own seed.
    cnt = 0
    feats_rnd = [{} for _ in range(emb_size)]
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, rand_gen, min_val, seed=cnt)

    sketches = [[{} for _ in range(emb_size)] for _ in range(depth+1)]

    # Hop 0: per slot keep the node's feature maximising weight / sqrt(random).
    cnt = 0
    for u, (subject, features) in nodedata.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('nodes processed', cnt)
        for i in range(emb_size):
            feats_rnd_i = feats_rnd[i]
            sketches[0][i][u] = {}
            max_w = 0
            max_f = None
            for f, w_f in features.items():
                w_rnd = w_f/math.sqrt(feats_rnd_i[f])
                if w_rnd > max_w:
                    max_w = w_rnd
                    max_f = f
            if max_w > 0:
                sketches[0][i][u] = {u : (max_w, max_f)}

    # Iterate over neighbourhoods and maintain the heaviest nodes.
    for d in range(depth):
        print('ITERATION', d)
        for emb in range(emb_size):
            # The previous hop is only read, and its values are immutable
            # tuples, so no deep copy is needed here.
            prev_sketches = sketches[d][emb]
            new_sketches = {}
            for u in G.nodes():
                if u not in prev_sketches:
                    continue
                # Shallow copy: entries are replaced, never mutated in place.
                sketch_u = dict(prev_sketches[u])
                for v in G.neighbors(u):
                    # A neighbour without a sketch contributes nothing
                    # (same treatment as for u itself above).
                    for t, (w_f, f) in prev_sketches.get(v, {}).items():
                        weight = sketch_u.get(t, (0, None))[0] + w_f
                        sketch_u[t] = (weight, f)

                # Keep only the `top` heaviest entries.
                triples = sorted(
                    ((w_t, f_t, t) for t, (w_t, f_t) in sketch_u.items()),
                    reverse=True)
                new_sketches[u] = {tr[2] : (tr[0], tr[1]) for tr in triples[:top]}
            sketches[d+1][emb] = new_sketches
    return sketches

In [133]:
randompath = "../random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [134]:
start = time.time()
sketches_l2_all = generate_L2_samples(G, nodedata, rand_gen=rand_gen, depth=depth, emb_size=emb_size, top=top)
end = time.time()
print('Elapsed time L2', end-start)

nodes processed 1000
nodes processed 2000
nodes processed 3000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3
Elapsed time L2 130.24861526489258


In [135]:
# sketches_l2_reduced = generate_L2_samples(H, nodedata, rand_gen, depth=depth, emb_size=emb_size, top=top)

In [136]:
vectors_l2_all = get_embeddings_l1_2(nodedata, sketches_l2_all)

In [137]:
# vectors_l2_reduced = get_embeddings_l1_2(nodedata, sketches_l2_reduced)

In [138]:
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l2_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_l2_all[d], outfile)

In [139]:
# for d in range(depth+1):
#     jsonpath = data_dir + "/vectors/vectors_l2_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
#     with open(jsonpath, 'w') as outfile:
#         json.dump(vectors_l2_reduced[d], outfile)