# Feature generation

This is the core of COLOGNE, where we generate discrete embeddings per node. Below we have implemented the following five methods for feature generation: 
- Random walks
- NodeSketch
- Minwise independent sampling (L0)
- L1 sampling
- L2 sampling

The graphs must be preprocessed before this code can run; please run the respective preprocessing notebooks first.
More details are provided as comments in the code below.

In [1]:
import os
import sys
import networkx as nx
import pandas as pd
import json
import random
import math
import copy
import time

In [2]:
# Names of the supported datasets; each one is read from ../Graphs/<name>
graphnames = ['Cora', 'Citeseer', 'Pubmed', 'HomoSapiens', 'Wikipedia', 'BlogCatalog']
idx = 0  # index into graphnames selecting the dataset processed by this run
graphname = graphnames[idx]
emb_size = 50 # how many samples per node to generate 
depth = 4 # the depth of the local neighborhood
# NOTE(review): expanduser only expands a leading '~', so it leaves this relative path unchanged
data_dir = os.path.expanduser("../Graphs/"+graphname)

In [3]:
# Load the node data from JSON: a dict keyed by node id (see the example entry in the next cell)
nodedata_path = data_dir + "/data/nodedata.json" 
with open(nodedata_path, "r") as read_file:
    nodedata = json.load(read_file)

In [4]:
# An example of node information: a two-element list holding the node label
# and a dictionary of words (with weights) describing the node.
nodedata[list(nodedata.keys())[129]]

['label=Genetic_Algorithms',
 {'w-19': 1, 'w-299': 1, 'w-393': 1, 'w-495': 1, 'w-507': 1, 'w-1263': 1}]

In [5]:
def read_graph_from_edge_list(filename, nodedata):
    """Build an undirected graph from a ':'-separated edge list file.

    The file is read from ``data_dir + "/data/" + filename`` (``data_dir`` is a
    notebook-level global). Each line is split on ':'; the first two fields,
    stripped of whitespace, are the edge endpoints. An edge is added only when
    both endpoints are keys of ``nodedata``. Lines without a ':' are ignored.

    Prints the total number of lines read and returns the networkx graph.
    """
    graph = nx.Graph()
    lines_seen = 0
    with open(data_dir + "/data/" + filename, 'r') as edgefile:
        for lines_seen, raw in enumerate(edgefile, start=1):
            parts = raw.split(':')
            if len(parts) < 2:
                # not an edge line
                continue
            src, dst = parts[0].strip(), parts[1].strip()
            if src in nodedata and dst in nodedata:
                graph.add_edge(src, dst)

    print(lines_seen)
    return graph
    

In [6]:
# the full graph; the printed number is the count of lines read from the edge file
G = read_graph_from_edge_list("all_graph_edges.txt", nodedata)

5278


In [7]:
# size of the full graph: (nodes, edges)
G.number_of_nodes(), G.number_of_edges()

(2708, 5278)

In [8]:
# the graph used for link prediction, built from the reduced edge list
H = read_graph_from_edge_list("graph_edges_reduced.txt", nodedata)

4222


In [9]:
# size of the reduced graph: (nodes, edges)
H.number_of_nodes(), H.number_of_edges()

(2708, 4222)

In [10]:
# graph built from the edges listed in removed_edges.txt
R = read_graph_from_edge_list("removed_edges.txt", nodedata)

1056


In [11]:
# size of the removed-edges graph: (nodes, edges)
R.number_of_nodes(), R.number_of_edges()

(1281, 1056)

In [14]:
def get_labels(label):
    """Extract the label values from a '='-separated label string.

    Returns an empty list when the string starts with 'None'. Otherwise the
    string is split on '=' and everything except the first and the last
    field is returned.
    """
    if label.startswith('None'):
        return []
    # drop the leading field and the trailing field
    return label.split('=')[1:-1]

# Random walk sampling

In [14]:
# generate a random int in [start, end]
def get_rnd_int_in_range(start, end):
    """Return a pseudo-random integer r with start <= r <= end.

    A large random integer is drawn and reduced modulo the size of the range.

    Raises:
        ValueError: if ``end < start`` (the range would be empty).
    """
    if end < start:
        raise ValueError("end must not be smaller than start")
    # The upper bound must be an int: random.randint rejects float arguments
    # such as 1e10 with a TypeError on Python 3.12+ (deprecated since 3.10).
    r = random.randint(0, 10**10)
    diff = end - start + 1
    return start + r % diff

In [24]:
# empirical check: the ten values 0..9 should appear roughly equally often
test_cnts = {}
for i in range(100000):
    val = get_rnd_int_in_range(0, 9)
    test_cnts[val] = test_cnts.get(val, 0) + 1
print(test_cnts)

{8: 10015, 3: 9967, 0: 9930, 7: 10046, 5: 9980, 9: 10043, 2: 9980, 4: 9869, 1: 10168, 6: 10002}


In [30]:
# generate a random value in [min_val, 1)
def get_rnd_value(min_val):
    """Return a pseudo-random float r with max(min_val, 0) <= r < 1.

    Values are drawn with random.random() (which yields floats in [0, 1))
    until one is at least ``min_val``.

    Raises:
        Exception: if ``min_val >= 1``, since no draw could ever be accepted.
    """
    if min_val >= 1:
        raise Exception("Minimum must be less than 1")
    # Always draw at least once: starting from a literal 0 would return 0
    # without sampling whenever min_val <= 0.
    rval = random.random()
    while rval < min_val:
        rval = random.random()
    return rval

In [42]:
# example draw with a small lower bound
get_rnd_value(1e-6)

0.6062148519886823

In [48]:
# a standard random walk starting from a node for 'depth' hops 
def random_walk(G, node, depth, features):
    """Walk up to ``depth`` steps from ``node`` and sample a feature of the end node.

    At every step the next node is chosen uniformly among the current node and
    its neighbors, so the walk may stay in place. The walk stops early at a
    node of degree 0.

    Args:
        G: graph exposing ``degree[node]`` and ``neighbors(node)``.
        node: start node; converted to ``str`` before use.
        depth: maximum number of steps.
        features: mapping node -> (subject, {feature: weight}).

    Returns:
        (end_node, subject_of_end_node, feature), where the feature is drawn
        from the end node's features with probability proportional to weight.

    Raises:
        IndexError: if the end node has no features to sample from.
    """
    curr_node = str(node)
    steps = 0
    while steps < depth and G.degree[curr_node] > 0:
        candidates = [curr_node] + list(G.neighbors(curr_node))
        curr_node = random.choice(candidates)
        steps += 1
    subject, features_node = features[curr_node]
    if not features_node:
        raise IndexError("node %r has no features to sample from" % (curr_node,))
    # The global RNG is deliberately NOT re-seeded here. Seeding it per walk
    # from a value in [0, |V|] would reduce the generator to a few thousand
    # possible states and make successive walks strongly correlated.
    w = random.choices(population=list(features_node.keys()), weights=list(features_node.values()), k=1)[0]
    return curr_node, subject, w

In [51]:
# example: a walk of at most 2 steps on the reduced graph, started from one of its nodes
node = list(H.nodes())[40]
random_walk(H, node, 2, features=nodedata)

('35', 'label=Genetic_Algorithms', 'w-1249')

In [52]:
# for each node generate a number of samples, i.e. the embedding size, by random walks
def all_nodes_random_walk(G, depth, nr_walks, features):
    """Run ``nr_walks`` random walks of length ``depth`` from every node of ``G``.

    Returns a dict mapping each node to a list of ``nr_walks`` tuples
    (sampled node, subject, feature), one per walk, as produced by random_walk.
    """
    vectors = {}
    for start_node in G.nodes():
        walks = []
        for _ in range(nr_walks):
            sample, subject, feature = random_walk(G, start_node, depth, features)
            walks.append((sample, subject, feature))
        vectors[start_node] = walks
    return vectors

In [54]:
# random walks on the full graph
# one run per walk length d = 0..depth; vectors_rw_all[d] holds the samples for walks of length d
vectors_rw_all = []
for d in range(depth+1):
    start = time.time()
    vectors_rw = all_nodes_random_walk(G, d, emb_size, features=nodedata)
    vectors_rw_all.append(vectors_rw)
    end = time.time()
    print('Elapsed time RW', end-start)

Elapsed time RW 2.0955722332000732
Elapsed time RW 2.443809747695923
Elapsed time RW 3.187406063079834
Elapsed time RW 4.012030124664307
Elapsed time RW 4.627681016921997


In [55]:
# random walks on the reduced graph used for link prediction
# vectors_rw_reduced[d] holds the samples for walks of length d
vectors_rw_reduced = [] 
for d in range(depth+1):
    vectors_rw = all_nodes_random_walk(H, d, emb_size, features=nodedata)
    vectors_rw_reduced.append(vectors_rw)

In [56]:
# inspect the samples of one node obtained with walks of length 1
vectors_rw_all[1][list(vectors_rw_all[0].keys())[10]]

[('248425', 'label=Genetic_Algorithms', 'w-495'),
 ('1113831', 'label=Genetic_Algorithms', 'w-1332'),
 ('248431', 'label=Genetic_Algorithms', 'w-495'),
 ('35', 'label=Genetic_Algorithms', 'w-507'),
 ('35', 'label=Genetic_Algorithms', 'w-464'),
 ('35', 'label=Genetic_Algorithms', 'w-191'),
 ('35', 'label=Genetic_Algorithms', 'w-1305'),
 ('35', 'label=Genetic_Algorithms', 'w-1247'),
 ('35', 'label=Genetic_Algorithms', 'w-748'),
 ('248431', 'label=Genetic_Algorithms', 'w-507'),
 ('1113831', 'label=Genetic_Algorithms', 'w-326'),
 ('248425', 'label=Genetic_Algorithms', 'w-1353'),
 ('1113831', 'label=Genetic_Algorithms', 'w-25'),
 ('248425', 'label=Genetic_Algorithms', 'w-1235'),
 ('35', 'label=Genetic_Algorithms', 'w-495'),
 ('248425', 'label=Genetic_Algorithms', 'w-478'),
 ('35', 'label=Genetic_Algorithms', 'w-580'),
 ('35', 'label=Genetic_Algorithms', 'w-495'),
 ('35', 'label=Genetic_Algorithms', 'w-1227'),
 ('248431', 'label=Genetic_Algorithms', 'w-1353'),
 ('248425', 'label=Genetic_Algo

In [57]:
# write one JSON file per hop for the full graph;
# open() does not create missing directories, so data_dir + "/vectors" must already exist
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_rwalk_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_rw_all[d], outfile)

In [58]:
# write one JSON file per hop for the reduced graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_rwalk_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_rw_reduced[d], outfile)

# Nodesketch sampling

In [60]:
def update_dict(d, k, min_val, seed):
    """Assign key ``k`` a pseudo-random value in [0, 1) unless it already has one.

    The value is the first draw of a generator seeded with ``seed``, so it is
    fully determined by ``seed``. Existing entries are never overwritten.

    Args:
        d: dictionary updated in place.
        k: key to initialise.
        min_val: unused; kept so existing callers keep working.
        seed: seed that determines the stored value.
    """
    if k not in d:
        # A private generator yields the same value as random.seed(seed)
        # followed by random.random(), but leaves the global RNG state alone
        # and skips the seeding cost for keys that are already present.
        d[k] = random.Random(seed).random()

In [61]:
def ioffe_sampling(arr, weights):
    """Pick the entry of ``arr`` that minimises -log(weights[key]) / weight.

    Args:
        arr: mapping key -> (feature, weight, label).
        weights: mapping key -> random value; its logarithm is taken.

    Returns:
        (key, feature, weight, label) of the entry with the smallest score.
        On ties the first entry in iteration order wins. If ``arr`` is empty
        or no score is below 1e6, returns (None, None, 0, None).
    """
    best = (None, None, 0, None)
    lowest_score = 1e6
    for key, vals in arr.items():
        feature, weight, label = vals[0], vals[1], vals[2]
        score = -math.log(weights[key]) / weight
        if score < lowest_score:
            lowest_score = score
            best = (key, feature, weight, label)
    return best

In [62]:
def update_arr(arr, new_node):
    """Merge ``new_node`` = (key, feature, weight, label) into ``arr`` in place.

    If the key is already present its stored weight is increased by the new
    weight, while feature and label are replaced by the new ones. Otherwise a
    fresh (feature, weight, label) entry is created. Returns ``arr``.
    """
    key, feature, weight, label = new_node[0], new_node[1], new_node[2], new_node[3]
    if key in arr:
        # accumulate on top of the weight stored so far
        weight = arr[key][1] + weight
    arr[key] = (feature, weight, label)
    return arr

In [63]:
def nodesketch_iter(G, nodedata, depth, emb_size):
    """Generate ``emb_size`` samples per node for every hop 0..depth.

    Steps:
      1. For each embedding index, give every feature a random value
         (update_dict, seeded with 13*cnt).
      2. For each embedding index and node, pick one of the node's own
         features with ioffe_sampling (hop 0).
      3. For each embedding index, give every node of ``G`` a random value
         (global RNG seeded with 1223*t).
      4. For ``depth`` rounds, merge each node's current sample with the
         current samples of its neighbors (update_arr adds up the weights of
         identical sampled nodes) and re-sample with ioffe_sampling using the
         node random values of step 3.
      5. Collect, per hop and node, the list of (sampled node, label, feature).

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping node -> (label, {feature: weight}).
        depth: number of propagation rounds.
        emb_size: number of samples per node.

    Returns:
        A list of ``depth + 1`` dicts; entry d maps every node of ``G`` to a
        list of ``emb_size`` tuples (sampled node, label, feature).

    NOTE(review): a node without features gets weight 0 at hop 0
    (ioffe_sampling returns weight 0 for an empty input); ioffe_sampling
    divides by that weight in step 4 — confirm such nodes cannot occur.
    """
    
    min_val = 1e-6  # only forwarded to update_dict
    feats_rnd = [{} for _ in range(emb_size)]
    cnt = 0
    # step 1: feats_rnd[i][f] is the random value of feature f for embedding i
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, min_val, seed=13*cnt)
                
    # step 2: node_labels[i][node] = (node, sampled feature, its weight, node label)
    node_labels = [{} for _ in range(emb_size)]
    for i in range(emb_size):
        node_labels_i = node_labels[i]
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            arr = {}
            for f, weight_f in feats[1].items():
                # keyed by feature so that ioffe_sampling looks up feats_rnd_i[f]
                arr[f] = (f, weight_f, feats[0])
            _, feature_sample, weight_sample, label_sample = ioffe_sampling(arr, feats_rnd_i)
            node_labels_i[node] = (node, feature_sample, weight_sample, label_sample)
            
    print('Sampled features')
    
    # step 3: node_rnd_vals_all[t][u] is the random value of node u for embedding t
    node_rnd_vals_all = [{} for _ in range(emb_size)]
    for t in range(emb_size):
        random.seed(1223*t)
        for u in G.nodes():
            node_rnd_vals_all[t][u] = random.random()
            
    # step 4: node_labels_all[d][t][u] is the sample of node u at hop d for embedding t
    node_labels_all = [[{} for _ in range(emb_size)] for _ in range(depth+1)]
    node_labels_all[0] = node_labels
    for d in range(depth):
        node_labels_iter = node_labels_all[d]
        print('Iteration', d)
        # NOTE(review): nothing in this loop body draws from `random` after this
        # call, so the seed looks like a leftover — confirm before removing
        random.seed(31*d)
        
        for t in range(emb_size):
            node_labels_iter_t = node_labels_iter[t]
            for u in G.nodes():
                node_sample_u, feature_sample_u, weight_sample_u, label_u = node_labels_iter_t[u]
                # candidates are keyed by the sampled node, starting with u's own sample
                arr_u = {node_sample_u: (feature_sample_u, weight_sample_u, label_u)} 
                for v in G.neighbors(u):
                    node_sample_v, feature_sample_v, weight_sample_v, label_v = node_labels_iter_t[v]
                    update_arr(arr_u, (node_sample_v, feature_sample_v, weight_sample_v, label_v))
                # results go to layer d+1, so every node reads the unmodified layer d
                node_labels_all[d+1][t][u] = ioffe_sampling(arr_u, node_rnd_vals_all[t]) 
                
    # step 5: reorder the stored (node, feature, weight, label) into (node, label, feature)
    node_embeddings = [{n:[] for n in G.nodes()} for _ in range(depth+1)]
    for d in range(depth+1):
        for u in G.nodes():
            for nl in node_labels_all[d]:
                node_embeddings[d][u].append((nl[u][0], nl[u][3], nl[u][1]))
    return node_embeddings
                    

In [66]:
# NodeSketch samples on the full graph, with wall-clock timing
start = time.time()
vectors_ns_all= nodesketch_iter(G, nodedata, depth=depth, emb_size=emb_size)
end = time.time()
print('Elapsed time Nodesketch', end-start)

Sampled features
Iteration 0
Iteration 1
Iteration 2
Iteration 3
Elapsed time Nodesketch 23.64945077896118


In [67]:
# write one JSON file per hop for the full graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_nodesketch_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_ns_all[d], outfile)

In [69]:
# NodeSketch samples on the reduced graph
vectors_ns_reduced = nodesketch_iter(H, nodedata, depth=depth, emb_size=emb_size)

Sampled features
Iteration 0
Iteration 1
Iteration 2
Iteration 3


In [70]:
# write one JSON file per hop for the reduced graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_nodesketch_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_ns_reduced[d], outfile)

# Min-wise (L0) sampling

In [71]:
# # Iteratively collect the minimum value for each node from its neighbors.
# # k iterations correspond to sampling from the k-hop neighborhood
# def minwise_iterate(G, rnd_nodes, nr_iter): #, features):
#     node_labels = [{} for _ in range(nr_iter+1)]
#     if nr_iter < 1:
#         raise Exception("There must be at least one iteration")
#     node_labels[0] = rnd_nodes
#     for iter in range(nr_iter):
#         rnd_nodes_iter = node_labels[iter]
#         print('Iteration', iter)
#         for u in G.nodes():
#             w_u = rnd_nodes_iter[u]
#             for v in G.neighbors(u):
#                 w_u = min(rnd_nodes_iter[v], w_u)
#             node_labels[iter+1][u] = w_u
#     return node_labels

In [73]:
# initialize random numbers for features for each embedding 
def init_dicts(nodedata, emb_size):
    """Create one {feature: random value} table per embedding index.

    Every (embedding, node, feature) occurrence advances a counter, and
    update_dict is called with seed 17 * counter; update_dict decides whether
    the feature still needs a value.

    Returns a list of ``emb_size`` dictionaries.
    """
    floor = 1e-6  # forwarded to update_dict
    draw_counter = 0
    feats_rnd = []
    for _ in range(emb_size):
        table = {}
        for entry in nodedata.values():
            for f in entry[1].keys():
                draw_counter += 1
                update_dict(table, f, floor, seed=17*draw_counter)
        feats_rnd.append(table)
    return feats_rnd

In [74]:
def generate_minwise_samples(G, nodedata, feats_rnd, depth, emb_size):
    """Min-wise sampling over the 0..depth hop neighborhoods of every node.

    Hop 0: for each embedding index and node, the node's feature with the
    smallest random value in ``feats_rnd`` is selected and stored as
    (value, node, label, feature). Each further hop replaces a node's tuple by
    the minimum over its own tuple and its neighbors' tuples from the
    previous hop.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping node -> (label, {feature: weight}).
        feats_rnd: list of ``emb_size`` dicts feature -> random value.
        depth: number of hops.
        emb_size: number of samples per node.

    Returns:
        A list of ``depth + 1`` dicts; entry d maps every node of ``G`` to a
        list of ``emb_size`` tuples (sampled node, label, feature).
    """
    # hop 0: the minimum-valued own feature per node and embedding
    base_layer = []
    for i in range(emb_size):
        rnd_i = feats_rnd[i]
        picked = {}
        for node, feats in nodedata.items():
            best_value, best_feature = 1e3, None
            for f in feats[1]:
                if rnd_i[f] < best_value:
                    best_feature, best_value = f, rnd_i[f]
            picked[node] = (best_value, node, feats[0], best_feature)
        base_layer.append(picked)

    layers = [base_layer]
    for d in range(depth):
        previous = layers[d]
        print('Iteration', d)
        current = [{} for _ in range(emb_size)]
        for u in G.nodes():
            for t in range(emb_size):
                prev_t = previous[t]
                smallest = prev_t[u]
                for v in G.neighbors(u):
                    smallest = min(prev_t[v], smallest)
                current[t][u] = smallest
        layers.append(current)

    # drop the random value and keep (sampled node, label, feature)
    node_embeddings = []
    for layer in layers:
        per_node = {n: [] for n in G.nodes()}
        for u in G.nodes():
            per_node[u].extend((nl[u][1], nl[u][2], nl[u][3]) for nl in layer)
        node_embeddings.append(per_node)
    return node_embeddings

In [76]:
# min-wise samples on the full graph, with wall-clock timing;
# feats_rnd is reused below for the reduced graph
start = time.time()
# nodes_rnd, labels_rnd, 
feats_rnd = init_dicts(nodedata, emb_size)
vectors_mw_all = generate_minwise_samples(G, nodedata, feats_rnd, depth=depth, emb_size=emb_size)
end = time.time()
print('Elapsed time MW', end-start)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Elapsed time MW 19.19777226448059


In [77]:
# vectors_mw_all[0]

In [78]:
# write one JSON file per hop for the full graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_minwise_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_mw_all[d], outfile)

In [79]:
# min-wise samples on the reduced graph, using the same feature random values as for the full graph
vectors_mw_reduced = generate_minwise_samples(H, nodedata, feats_rnd, depth=depth, emb_size=emb_size)

Iteration 0
Iteration 1
Iteration 2
Iteration 3


In [80]:
# write one JSON file per hop for the reduced graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_minwise_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_mw_reduced[d], outfile)

# L1 sampling

In [84]:
# generating L1 samples from the k-hop neighborhood
# top is the summary size of the frequent items mining algorithm
def generate_L1_samples(G, nodedata, depth, emb_size, top):
    """Build per-node sketches for L1 sampling over the 0..depth hop neighborhoods.

    Hop 0: for each embedding index, every feature gets a random value r
    (update_dict, seeded with 23*cnt); each node keeps its feature that
    maximises weight / r, stored as {node: (scaled weight, feature)}. Nodes
    without a positive maximum get an empty sketch.

    Each further hop: a node's sketch is merged with its neighbors' sketches
    from the previous hop (weights of the same sampled node are added up, the
    feature is taken from the last merged entry). Only the ``top`` heaviest
    entries are kept, and if more than ``top`` entries existed, the weight of
    the first dropped entry is subtracted from the kept ones.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping node -> (label, {feature: weight}).
        depth: number of hops.
        emb_size: number of sketches per node.
        top: maximum number of entries kept per sketch.

    Returns:
        ``sketches`` with ``sketches[d][emb][u]`` being a dict
        sampled node -> (weight, feature).
    """
    min_val = 1e-6  # forwarded to update_dict
    feats_rnd = [{} for _ in range(emb_size)]
    cnt = 0
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, min_val, seed=23*cnt)

    sketches = [[{} for _ in range(emb_size)] for _ in range(depth+1)]

    cnt = 0
    # hop 0: pick the feature with the largest scaled weight for each node
    for u, (subject, features) in nodedata.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('nodes processed', cnt)
        for i in range(emb_size):
            feats_rnd_i = feats_rnd[i]
            sketches[0][i][u] = {}
            max_w = 0
            max_f = None
            for f, w_f in features.items():
                w_rnd = w_f/feats_rnd_i[f]
                if w_rnd > max_w:
                    max_w = w_rnd
                    max_f = f
            if max_w > 0:
                sketches[0][i][u] = {u : (max_w, max_f)}

    # iterate over neighborhoods and maintain the heaviest nodes
    for d in range(depth):
        print('ITERATION', d)
        for emb in range(emb_size):
            # The previous layer is only read, and results go into a fresh
            # dict, so no deep copy of the layer is required.
            sketch_iter_emb = sketches[d][emb]
            new_sketches = {}
            for u in G.nodes():
                if u not in sketch_iter_emb:
                    continue
                # shallow copy is enough: the values are immutable tuples
                sketch_u = dict(sketch_iter_emb[u])
                for v in G.neighbors(u):
                    for t, (w_f, f) in sketch_iter_emb[v].items():
                        weight = sketch_u.get(t, (0, None))[0] + w_f
                        sketch_u[t] = (weight, f)

                # mining heavy hitters: sort by weight, heaviest first
                triples = sorted(
                    ((feat_node[0], feat_node[1], node) for node, feat_node in sketch_u.items()),
                    reverse=True,
                )
                to_subtract = triples[top][0] if len(triples) > top else 0
                new_sketches[u] = {tr[2] : (tr[0]-to_subtract, tr[1]) for tr in triples[:top]}
            sketches[d+1][emb] = new_sketches
    return sketches

In [85]:
# maximum number of entries kept per sketch in the L1 and L2 samplers
top=10

In [86]:
# L1 sketches on the full graph, with wall-clock timing
start = time.time()
sketches_l1_all = generate_L1_samples(G, nodedata, depth=depth, emb_size=emb_size, top=top)
end = time.time()
print('Elapsed time L1', end-start)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3
Elapsed time L1 51.26462650299072


In [88]:
# L1 sketches on the reduced graph
sketches_l1_reduced = generate_L1_samples(H, nodedata, depth=depth, emb_size=emb_size, top=top)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3


In [89]:
# sanity check: number of sketches (one per embedding index) at hop 2
len(sketches_l1_all[2]), len(sketches_l1_reduced[2])

(50, 50)

In [90]:
def get_embeddings_l1_2(nodedata, sketches):
    """Turn L1/L2 sketches into per-node lists of (sampled node, label, word).

    For every hop d, every sketch of that hop and every node, the entry with
    the largest weight in the node's sketch is selected. The emitted tuple
    holds the sampled node, that sampled node's label from ``nodedata`` and
    the sampled word. Sketches with no entry of positive weight contribute
    nothing, so a node's list may be shorter than the number of sketches.

    Args:
        nodedata: mapping node -> (label, {feature: weight}).
        sketches: ``sketches[d][e][node]`` is a dict
            sampled node -> (weight, word).

    Returns:
        A list with one dict per hop, mapping every node of ``nodedata`` to
        its list of tuples.
    """
    embeddings = [{node: [] for node in nodedata} for _ in sketches]
    for d, sketches_d in enumerate(sketches):
        # iterate over the sketches that exist instead of a global emb_size
        for sketch_e in sketches_d:
            for node, dct in sketch_e.items():
                max_node = None
                max_word = None
                max_weight = 0
                for sampled_node, ww in dct.items(): # ww: weight word
                    if ww[0] > max_weight:
                        # remember the node together with its word, so the
                        # tuple reports the heaviest entry and not whichever
                        # entry happened to be iterated last
                        max_node = sampled_node
                        max_word = ww[1]
                        max_weight = ww[0]
                if max_weight > 0:
                    # label of the sampled node, matching the tuples emitted
                    # by the random-walk, NodeSketch and min-wise samplers
                    label = nodedata[max_node][0]
                    embeddings[d][node].append((max_node, label, max_word))
    return embeddings

In [91]:
# L1 embeddings for the full graph
vectors_l1_all = get_embeddings_l1_2(nodedata, sketches_l1_all)

In [92]:
# vectors_l1_all 

In [93]:
# L1 embeddings for the reduced graph
vectors_l1_reduced = get_embeddings_l1_2(nodedata, sketches_l1_reduced)

In [94]:
# write one JSON file per hop for the full graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l1_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_l1_all[d], outfile)

In [95]:
# write one JSON file per hop for the reduced graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l1_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
         json.dump(vectors_l1_reduced[d], outfile)

# L2 sampling

In [96]:
# generating L2 samples from the k-hop neighborhood
# top is the summary size of the frequent items mining algorithm
def generate_L2_samples(G, nodedata, depth, emb_size, top):
    """Build per-node sketches for L2 sampling over the 0..depth hop neighborhoods.

    Hop 0: for each embedding index, every feature gets a random value r
    (update_dict, seeded with cnt); each node keeps its feature that
    maximises weight / sqrt(r), stored as {node: (scaled weight, feature)}.
    Nodes without a positive maximum get an empty sketch.

    Each further hop: a node's sketch is merged with its neighbors' sketches
    from the previous hop (weights of the same sampled node are added up, the
    feature is taken from the last merged entry) and only the ``top``
    heaviest entries are kept.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(u)``.
        nodedata: mapping node -> (label, {feature: weight}).
        depth: number of hops.
        emb_size: number of sketches per node.
        top: maximum number of entries kept per sketch.

    Returns:
        ``sketches`` with ``sketches[d][emb][u]`` being a dict
        sampled node -> (weight, feature).
    """
    min_val = 1e-6  # forwarded to update_dict
    cnt = 0
    feats_rnd = [{} for _ in range(emb_size)]
    for i in range(emb_size):
        feats_rnd_i = feats_rnd[i]
        for node, feats in nodedata.items():
            for f, weight_f in feats[1].items():
                cnt += 1
                update_dict(feats_rnd_i, f, min_val, seed=cnt)

    sketches = [[{} for _ in range(emb_size)] for _ in range(depth+1)]

    cnt = 0
    # hop 0: pick the feature with the largest scaled weight for each node
    for u, (subject, features) in nodedata.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('nodes processed', cnt)
        for i in range(emb_size):
            feats_rnd_i = feats_rnd[i]
            sketches[0][i][u] = {}
            max_w = 0
            max_f = None
            for f, w_f in features.items():
                w_rnd = w_f/math.sqrt(feats_rnd_i[f])
                if w_rnd > max_w:
                    max_w = w_rnd
                    max_f = f
            if max_w > 0:
                sketches[0][i][u] = {u : (max_w, max_f)}

    # iterate over neighborhoods and maintain the heaviest nodes
    for d in range(depth):
        print('ITERATION', d)
        for emb in range(emb_size):
            # The previous layer is only read, and results go into a fresh
            # dict, so no deep copy of the layer is required.
            sketch_iter_emb = sketches[d][emb]
            new_sketches = {}
            for u in G.nodes():
                if u not in sketch_iter_emb:
                    continue
                # shallow copy is enough: the values are immutable tuples
                sketch_u = dict(sketch_iter_emb[u])
                for v in G.neighbors(u):
                    for t, (w_f, f) in sketch_iter_emb[v].items():
                        weight = sketch_u.get(t, (0, None))[0] + w_f
                        sketch_u[t] = (weight, f)
                # keep the `top` heaviest entries, heaviest first
                top_triples = sorted(
                    ((feat_node[0], feat_node[1], node) for node, feat_node in sketch_u.items()),
                    reverse=True,
                )[:top]
                new_sketches[u] = {tr[2] : (tr[0], tr[1]) for tr in top_triples}
            sketches[d+1][emb] = new_sketches
    return sketches

In [97]:
# L2 sketches on the full graph, with wall-clock timing
start = time.time()
sketches_l2_all = generate_L2_samples(G, nodedata, depth=depth, emb_size=emb_size, top=top)
end = time.time()
print('Elapsed time L2', end-start)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3
Elapsed time L2 55.881694316864014


In [99]:
# L2 sketches on the reduced graph
sketches_l2_reduced = generate_L2_samples(H, nodedata, depth=depth, emb_size=emb_size, top=top)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1
ITERATION 2
ITERATION 3


In [100]:
# L2 embeddings for the full graph (same conversion as for the L1 sketches)
vectors_l2_all = get_embeddings_l1_2(nodedata, sketches_l2_all)

In [101]:
# L2 embeddings for the reduced graph
vectors_l2_reduced = get_embeddings_l1_2(nodedata, sketches_l2_reduced)

In [102]:
# write one JSON file per hop for the full graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l2_all_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_l2_all[d], outfile)

In [103]:
# write one JSON file per hop for the reduced graph
for d in range(depth+1):
    jsonpath = data_dir + "/vectors/vectors_l2_reduced_" + str(emb_size) + "_hop_" + str(d) + ".json"
    with open(jsonpath, 'w') as outfile:
        json.dump(vectors_l2_reduced[d], outfile)