In [1]:
import os
import sys
import networkx as nx
import pandas as pd
import json
import random
import copy
import time
from BinaryStream import BinaryStream
from TabulationHashing import TabulationHashing

In [2]:
# Datasets that can be processed; the index below selects the active one.
graphnames = ['Cora', 'Citeseer', 'Pubmed']
graphname = graphnames[0]
# Number of samples generated per node by each sampling method below.
emb_size = 50
# Dataset directory, relative to the current working directory.
data_dir = os.path.expanduser("Graphs/"+graphname)

In [3]:
# Hash-based random source passed to the sampling helpers below.
# NOTE(review): presumably TabulationHashing fills its tables from this file — confirm.
randompath = "random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [4]:
# Load the per-node data: a dict mapping node id -> [label, {feature: weight}].
nodedata_path = data_dir + "/data/nodedata.json" #os.path.join(data_dir, "nodedata.json")
with open(nodedata_path, "r") as read_file:
    nodedata = json.load(read_file)

In [5]:
 nodedata[list(nodedata.keys())[10]]

['label=Neural_Networks',
 {'w-118': 1,
  'w-171': 1,
  'w-533': 1,
  'w-820': 1,
  'w-1209': 1,
  'w-1241': 1,
  'w-1273': 1}]

In [6]:
def read_graph_from_edge_list(filename, nodedata):
    """Build an undirected graph from a colon-separated edge list.

    The file is opened at ``data_dir + "/data/" + filename`` (``data_dir`` is
    the notebook-level global).  Every line is split on ':'; the first two
    fields, stripped of surrounding whitespace, are the edge endpoints.
    Lines without a ':' are ignored, and an edge is added only when both
    endpoints are keys of ``nodedata``.

    Returns the resulting ``networkx.Graph``.
    """
    graph = nx.Graph()
    edge_path = data_dir + "/data/" + filename
    with open(edge_path, 'r') as edgefile:
        for raw_line in edgefile:
            fields = raw_line.split(':')
            if len(fields) <= 1:
                # no separator on this line, so there is no edge to parse
                continue
            u, v = fields[0].strip(), fields[1].strip()
            if u in nodedata and v in nodedata:
                graph.add_edge(u, v)
    return graph
    

In [7]:
# Graph built from the complete edge list.
G = read_graph_from_edge_list("all_graph_edges.txt", nodedata)

In [8]:
G.number_of_nodes(), G.number_of_edges()

(2708, 5278)

In [9]:
# Graph built from the reduced edge list (same node count as G, fewer edges per the outputs).
H = read_graph_from_edge_list("graph_edges_reduced.txt", nodedata)

In [10]:
H.number_of_nodes(), H.number_of_edges()

(2708, 4222)

In [11]:
# Graph built from the removed-edges file.
# NOTE(review): presumably the edges of G that are absent from H — confirm.
R = read_graph_from_edge_list("removed_edges.txt", nodedata)

In [12]:
R.number_of_nodes(), R.number_of_edges()

(1281, 1056)

In [13]:
# a random int in [start, end]
def get_rnd_int_in_range(rand_gen, start, end):
    """Return a pseudo-random integer in the closed interval [start, end].

    A key is drawn with ``random.randint`` and passed through
    ``rand_gen.hashValueInt``; the hash is reduced modulo the interval width
    ``end - start + 1`` and shifted by ``start``.

    Parameters
    ----------
    rand_gen : object exposing ``hashValueInt(int) -> int``
    start, end : int
        Inclusive bounds of the interval.
    """
    # The upper bound must be an int: random.randint rejects float arguments
    # such as 1e10 with a TypeError on Python 3.12+ (deprecated since 3.10).
    key = random.randint(0, 10**10)
    width = end - start + 1
    return rand_gen.hashValueInt(key) % width + start

In [14]:
get_rnd_int_in_range(rand_gen, 0, 100)

4

In [15]:
def get_rnd_value(rand_gen, min_val):
    """Return a hashed random value that is not smaller than ``min_val``.

    Keys are drawn with ``random.randint`` and mapped through
    ``rand_gen.hashValue`` until the result is at least ``min_val``.  Callers
    in this notebook divide by the result, so a small positive ``min_val``
    keeps the quotient bounded.

    When ``min_val <= 0`` the loop body never runs and 0 is returned.  The
    loop does not terminate if ``rand_gen.hashValue`` never reaches
    ``min_val``.
    """
    value = 0
    while value < min_val:
        # The upper bound must be an int: random.randint rejects float
        # arguments such as 1e10 with a TypeError on Python 3.12+.
        key = random.randint(0, 10**10)
        value = rand_gen.hashValue(key)
    return value

In [16]:
get_rnd_value(rand_gen, 1e-6)

0.25111363822251553

In [17]:
def random_walk(G, node, depth, features, rand_gen):
    """Walk up to ``depth`` random steps from ``node`` and sample a feature.

    The start node is converted to ``str``.  At every step a neighbour of the
    current node is picked through ``get_rnd_int_in_range``; the walk stops
    early when the current node has degree 0.  One feature of the node where
    the walk ends is then drawn with ``random.choices``, using the feature
    weights stored in ``features``.

    Returns a tuple ``(end_node, subject, feature)`` where ``subject`` is the
    first element of ``features[end_node]``.
    """
    current = str(node)
    steps_taken = 0
    while steps_taken < depth and G.degree[current] > 0:
        neighbours = list(G.neighbors(current))
        pick = get_rnd_int_in_range(rand_gen, 0, len(neighbours) - 1)
        current = neighbours[pick]
        steps_taken += 1

    subject, node_features = features[current]
    names = list(node_features.keys())
    weights = list(node_features.values())
    sampled_feature = random.choices(population=names, weights=weights, k=1)[0]
    return current, subject, sampled_feature

In [18]:
node = list(H.nodes())[20]
random_walk(H, node, 2, features=nodedata, rand_gen=rand_gen)

('87417', 'label=Genetic_Algorithms', 'w-250')

In [19]:
def all_nodes_random_walk(G, depth, nr_walks, features, rand_gen):
    """Run ``nr_walks`` random walks of length ``depth`` from every node of G.

    Returns a dict mapping each node to a list of ``nr_walks`` tuples
    ``(end_node, subject, feature)`` as produced by ``random_walk``.
    """
    walks_by_node = {}
    for start_node in G.nodes():
        samples = []
        for _ in range(nr_walks):
            reached, subject, word = random_walk(G, start_node, depth, features, rand_gen)
            samples.append((reached, subject, word))
        walks_by_node[start_node] = samples
    return walks_by_node

In [20]:
# Random-walk samples on the full graph: walks of depth 2, emb_size walks per node (timed).
start = time.time()
vectors_rw_all = all_nodes_random_walk(G, 2, emb_size, features=nodedata, rand_gen=rand_gen)
end = time.time()
print('Elapsed time RW', end-start)

Elapsed time RW 2.049431800842285


In [21]:
# Same random-walk sampling, on the reduced graph H.
vectors_rw_reduced = all_nodes_random_walk(H, 2, emb_size, features=nodedata, rand_gen=rand_gen)

In [22]:
vectors_rw_all[list(vectors_rw_all.keys())[100]][:3]

[('575292', 'label=Genetic_Algorithms', 'w-1198'),
 ('262178', 'label=Genetic_Algorithms', 'w-333'),
 ('427606', 'label=Genetic_Algorithms', 'w-464')]

In [23]:
# Persist the random-walk samples of the full graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_rwalk_all_" + str(emb_size) + ".json"
#os.path.join(data_dir, "vectors_rwalk_all_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_rw_all, outfile)

In [24]:
# Persist the random-walk samples of the reduced graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_rwalk_reduced_" + str(emb_size) + ".json"
# os.path.join(data_dir, "vectors_rwalk_reduced_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_rw_reduced, outfile)

In [25]:
def minwise_iterate(G, rnd_nodes, nr_iter, features):
    """Propagate the minimum node value over neighbourhoods for ``nr_iter`` rounds.

    ``rnd_nodes`` maps every node of G to a comparable value.  In each round a
    node receives the minimum of its own value and the values of its
    neighbours from the previous round.

    Returns a list of ``nr_iter + 1`` dicts; element 0 is ``rnd_nodes`` itself
    and element ``k`` holds the values after ``k`` rounds.  Raises
    ``Exception`` when ``nr_iter < 1``.  ``features`` is accepted but not used.
    """
    rounds = [{} for _ in range(nr_iter + 1)]
    if nr_iter < 1:
        raise Exception("There must be at least one iteration")
    rounds[0] = rnd_nodes
    for step in range(nr_iter):
        previous = rounds[step]
        following = rounds[step + 1]
        print('Iteration', step)
        for u in G.nodes():
            smallest = previous[u]
            for v in G.neighbors(u):
                smallest = min(previous[v], smallest)
            following[u] = smallest
    return rounds

In [26]:
def update_dict(d, k, rand_gen, min_val):
    """Assign a fresh random value to ``d[k]`` unless the key already exists.

    The value comes from ``get_rnd_value(rand_gen, min_val)``; no random value
    is drawn when ``k`` is already present.  ``d`` is modified in place and
    nothing is returned.
    """
    if k in d:
        return
    d[k] = get_rnd_value(rand_gen, min_val)

In [27]:
# initialize random numbers for nodes and features for each embedding
def init_dicts(nodedata, emb_size, rand_gen=rand_gen):
    """Draw one random value per node, label and feature for every embedding.

    ``nodedata`` maps node -> [label, {feature: weight}].  For each of the
    ``emb_size`` embeddings three dicts are filled through ``update_dict``
    (minimum value 1e-6), so a key seen several times keeps its first value.

    Returns ``(nodes_rnd, labels_rnd, feats_rnd)``, three lists of
    ``emb_size`` dicts each.

    Note: the default ``rand_gen`` is the module-level generator that exists
    when this ``def`` is executed; rebinding the global afterwards does not
    change the default.
    """
    min_val = 1e-6
    nodes_rnd, labels_rnd, feats_rnd = [], [], []
    for _ in range(emb_size):
        node_values, label_values, feature_values = {}, {}, {}
        for node, feats in nodedata.items():
            update_dict(node_values, node, rand_gen, min_val)
            update_dict(label_values, feats[0], rand_gen, min_val)
            for feature in feats[1].keys():
                update_dict(feature_values, feature, rand_gen, min_val)
        nodes_rnd.append(node_values)
        labels_rnd.append(label_values)
        feats_rnd.append(feature_values)
    return nodes_rnd, labels_rnd, feats_rnd

In [28]:
# nodes_rnd, labels_rnd, feats_rnd = init_dicts(nodedata, emb_size)

In [29]:
# print(labels_rnd[:2])

[{'label=Neural_Networks': 0.5744633343192777, 'label=Rule_Learning': 0.23460164579904508, 'label=Reinforcement_Learning': 0.20869038102312337, 'label=Probabilistic_Methods': 0.7762539606541157, 'label=Theory': 0.8341305829247562, 'label=Genetic_Algorithms': 0.41657855166284, 'label=Case_Based': 0.007728665922007075}, {'label=Neural_Networks': 0.5307272817214225, 'label=Rule_Learning': 0.8372250791511366, 'label=Reinforcement_Learning': 0.10686022834511369, 'label=Probabilistic_Methods': 0.3498957911309028, 'label=Theory': 0.7839438127140653, 'label=Genetic_Algorithms': 0.5477515560814709, 'label=Case_Based': 0.2032043860703288}]


In [30]:
def generate_minwise_samples(G, nodedata, feats_rnd, nr_iter, emb_size):
    """Sample (node, label, feature) triples by min-wise propagation.

    For each embedding ``t`` every node of ``nodedata`` starts with the tuple
    ``(value, node, label, feature)`` of its feature with the smallest value
    in ``feats_rnd[t]`` (``(1e3, node, label, None)`` when it has no feature
    below 1e3).  During ``nr_iter`` rounds each node of G takes the minimum
    tuple among itself and its neighbours from the previous round.

    Returns a dict mapping each node of G to a list of ``emb_size`` triples
    ``(origin_node, label, feature)`` taken from the last round.
    """
    # Round 0: every node is represented by its own minimal feature.
    initial_labels = []
    for emb in range(emb_size):
        rnd_of_feature = feats_rnd[emb]
        labels_emb = {}
        for node, feats in nodedata.items():
            best_value, best_feature = 1e3, None
            for feature in feats[1]:
                candidate = rnd_of_feature[feature]
                if candidate < best_value:
                    best_feature, best_value = feature, candidate
            labels_emb[node] = (best_value, node, feats[0], best_feature)
        initial_labels.append(labels_emb)

    rounds = [[{} for _ in range(emb_size)] for _ in range(nr_iter + 1)]
    rounds[0] = initial_labels
    for step in range(nr_iter):
        previous, following = rounds[step], rounds[step + 1]
        print('Iteration', step)
        for u in G.nodes():
            for emb in range(emb_size):
                labels_prev = previous[emb]
                smallest = labels_prev[u]
                for v in G.neighbors(u):
                    smallest = min(labels_prev[v], smallest)
                following[emb][u] = smallest

    # Drop the random value and keep (origin node, label, feature) per embedding.
    final_round = rounds[nr_iter]
    node_embeddings = {n: [] for n in G.nodes()}
    for u in G.nodes():
        samples = node_embeddings[u]
        for labels_emb in final_round:
            entry = labels_emb[u]
            samples.append((entry[1], entry[2], entry[3]))
    return node_embeddings

In [31]:
# Min-wise samples on the full graph, 2 propagation rounds (timed, including the random initialisation).
start = time.time()
nodes_rnd, labels_rnd, feats_rnd = init_dicts(nodedata, emb_size)
vectors_mw_all = generate_minwise_samples(G, nodedata, feats_rnd, nr_iter=2, emb_size=emb_size)
end = time.time()
print('Elapsed time MW', end-start)

Iteration 0
Iteration 1
Elapsed time MW 0.7884116172790527


In [32]:
# Persist the min-wise samples of the full graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_minwise_all_" + str(emb_size) + ".json"
#os.path.join(data_dir, "vectors_minwise_all_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_mw_all, outfile)

In [33]:
# Min-wise samples on the reduced graph, reusing the feats_rnd values drawn for the full graph.
vectors_mw_reduced = generate_minwise_samples(H, nodedata, feats_rnd, nr_iter=2, emb_size=emb_size)

Iteration 0
Iteration 1


In [34]:
# Persist the min-wise samples of the reduced graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_minwise_reduced_" + str(emb_size) + ".json"
# os.path.join(data_dir, "vectors_minwise_reduced_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_mw_reduced, outfile)

In [35]:
def generate_L1_samples(G, nodedata, rand_gen, nr_iter, emb_size, top):
    """Build per-node sketches of weighted (node, feature) samples.

    Round 0: for every node ``u`` of ``nodedata`` and every embedding, each
    feature weight is divided by a random value from ``get_rnd_value`` and the
    feature with the largest quotient is kept, giving the sketch
    ``{u: (weight, feature)}`` (an empty dict if no quotient is positive).

    Rounds 1..nr_iter: the sketch of each node of G is merged with the
    sketches of its neighbours from the previous round.  Weights of entries
    with the same origin node are summed and the feature of the entry merged
    last is kept.  Only the ``top`` entries, ordered by descending
    ``(weight, feature, node)``, survive.

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``neighbors(u)``
    nodedata : dict mapping node -> [label, {feature: weight}]
    rand_gen : random source forwarded to ``get_rnd_value``
    nr_iter : number of propagation rounds
    emb_size : number of independent sketches per node
    top : maximum number of entries kept in a sketch after each round

    Returns
    -------
    list of ``nr_iter + 1`` lists; ``sketches[k][e]`` maps a node to its
    sketch ``{origin_node: (weight, feature)}`` after ``k`` rounds for
    embedding ``e``.  Nodes of ``nodedata`` that are not in G appear only in
    round 0.
    """
    sketches = [[{} for _ in range(emb_size)] for _ in range(nr_iter+1)]

    min_val = 1e-6  # lower bound on the random divisor, keeps quotients bounded
    for cnt, (u, (subject, features)) in enumerate(nodedata.items(), start=1):
        if cnt % 1000 == 0:
            print('nodes processed', cnt)
        for i in range(emb_size):
            sketches[0][i][u] = {}
            max_w = 0
            max_f = None
            for f, w_f in features.items():
                w_rnd = w_f / get_rnd_value(rand_gen, min_val)
                if w_rnd > max_w:
                    max_w = w_rnd
                    max_f = f
            if max_w > 0:
                sketches[0][i][u] = {u: (max_w, max_f)}

    for round_idx in range(nr_iter):
        print('ITERATION', round_idx)
        for emb in range(emb_size):
            # The previous round is only read, never modified, so no copy of
            # the whole structure is needed.
            previous = sketches[round_idx][emb]
            new_sketches = {}
            for u in G.nodes():
                if u not in previous:
                    continue
                # Entries are immutable tuples, so a shallow copy isolates
                # the merge from the previous round.
                merged = dict(previous[u])
                for v in G.neighbors(u):
                    # A neighbour without a sketch contributes nothing, in
                    # line with the guard on u above.
                    for t, (w_f, f) in previous.get(v, {}).items():
                        merged[t] = (merged.get(t, (0, None))[0] + w_f, f)
                ranked = sorted(
                    ((weight, feat, origin) for origin, (weight, feat) in merged.items()),
                    reverse=True,
                )[:top]
                new_sketches[u] = {origin: (weight, feat) for weight, feat, origin in ranked}
            sketches[round_idx+1][emb] = new_sketches
    return sketches

In [36]:
# Re-create the random source with the same arguments as the earlier cell.
# NOTE(review): presumably this restarts the generator from its initial state — confirm in TabulationHashing.
randompath = "random/bits.01"
rand_gen = TabulationHashing(randompath, rows=4, shift=16)

In [37]:
# L1 sketches on the full graph: 2 propagation rounds, at most 10 entries per sketch (timed).
start = time.time()
sketches_l1_all = generate_L1_samples(G, nodedata, rand_gen, nr_iter=2, emb_size=emb_size, top=10)
end = time.time()
print('Elapsed time L1', end-start)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1
Elapsed time L1 18.305689811706543


In [38]:
# L1 sketches on the reduced graph, with the same settings as for the full graph.
sketches_l1_reduced = generate_L1_samples(H, nodedata, rand_gen, nr_iter=2, emb_size=emb_size, top=10)

nodes processed 1000
nodes processed 2000
ITERATION 0
ITERATION 1


In [39]:
len(sketches_l1_all[2]), len(sketches_l1_reduced[2])

(50, 50)

In [40]:
def get_embeddings_l1(nodedata, sketches):
    """Turn the final-round L1 sketches into (node, label, feature) samples.

    For every embedding and every node, the sketch entry with the largest
    weight is selected and the triple ``(sampled_node, label, feature)`` is
    appended to that node's list, where ``label`` is the label of the sampled
    node in ``nodedata``.  Sketches without a positive weight contribute
    nothing, so a node may end up with fewer samples than embeddings.

    Parameters
    ----------
    nodedata : dict mapping node -> [label, {feature: weight}]
    sketches : list of rounds as returned by ``generate_L1_samples``; the last
        round is used and its length gives the number of embeddings.

    Returns
    -------
    dict mapping every node of ``nodedata`` to its list of triples.
    """
    embeddings = {node: [] for node in nodedata}
    # Use the last round and its own length rather than a fixed index and the
    # notebook-level emb_size, so any nr_iter / emb_size combination works.
    final_round = sketches[-1]
    for sketch in final_round:
        for node, candidates in sketch.items():
            best_node, best_word, best_weight = None, None, 0
            for sampled_node, (weight, word) in candidates.items():
                if weight > best_weight:
                    # Remember the node together with its word; reading the
                    # loop variable after the loop would give the last key of
                    # the sketch instead of the heaviest entry.
                    best_node, best_word, best_weight = sampled_node, word, weight
            if best_weight > 0:
                # Label of the sampled node, so the triple describes a single
                # node, as in the random-walk and min-wise samples.
                label = nodedata[best_node][0]
                embeddings.setdefault(node, []).append((best_node, label, best_word))
    return embeddings

In [41]:
# One (node, label, feature) sample per embedding from the full-graph sketches.
vectors_l1_all = get_embeddings_l1(nodedata, sketches_l1_all)

In [42]:
# One (node, label, feature) sample per embedding from the reduced-graph sketches.
vectors_l1_reduced = get_embeddings_l1(nodedata, sketches_l1_reduced)

In [43]:
# Persist the L1 samples of the full graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_l1_all_" + str(emb_size) + ".json"
# os.path.join(data_dir, "vectors_l1_all_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_l1_all, outfile)

In [44]:
# Persist the L1 samples of the reduced graph (tuples are written as JSON arrays).
# The target directory must already exist; open() does not create it.
jsonpath = data_dir + "/vectors/vectors_l1_reduced_" + str(emb_size) + ".json"
#os.path.join(data_dir, "vectors_l1_reduced_" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors_l1_reduced, outfile)