In [1]:
import os
import sys
import networkx as nx
import pandas as pd
import json
import random
import copy
from BinaryStream import BinaryStream

In [4]:
graphname = 'Pubmed'
emb_size = 50
data_dir = os.path.expanduser("/home/koki/Desktop/Data/Graphs/"+graphname)

In [5]:
G = nx.Graph()
with open(os.path.join(data_dir, graphname.lower() + ".cites"), 'r') as edgefile:
    for line in edgefile:
        line_split = line.split('|')
        if len(line_split) > 1:
            l0 = line_split[0]
            l1 = line_split[1]
            u = l0.split(':')[1]
            v = l1.split(':')[1]
            G.add_edge(str(u).strip(), str(v).strip())
        

In [6]:
nodedata = {}
with open(os.path.join(data_dir, graphname.lower() + ".content"), 'r') as contentfile:
    for line in contentfile:
        line_split = line.split()
        if len(line_split) < 3:
            continue
        if line_split[0] not in G.nodes():
            continue
        nodewords = {}
        for i in range(2, len(line_split)):
            w = line_split[i]
            w_split = w.split('=')
            if w_split[0] == 'summary':
                continue
            nodewords[w.split('=')[0]] = float(w.split('=')[1])
        nodedata[line_split[0]] = (line_split[1], nodewords)
    

In [7]:
nodes = set()
labels = set()
word_indices = set()
for node, features in nodedata.items():
    nodes.add(node)
    labels.add(features[0])
    for w in features[1]:
        word_indices.add(str(w))
        
nodes_path = os.path.join(data_dir, "graph_nodes.txt")
with open(nodes_path, 'w') as outfile:
    for node in nodes:
        outfile.write(node + '\n')
        
labels_path = os.path.join(data_dir, "labels.txt")
with open(labels_path, 'w') as outfile:
    for label in labels:
        outfile.write(label + '\n')
        
words_path = os.path.join(data_dir, "words_indices.txt")
with open(words_path, 'w') as outfile:
    for wi in word_indices:
        outfile.write(wi + '\n')

In [8]:
randompath = "/home/koki/Desktop/Data/random/bits.02"
file = open(randompath, 'rb')
stream = BinaryStream(file)
removed_edges = set()
H = copy.deepcopy(G)
while len(removed_edges) < 0.1*G.number_of_edges():
    if len(removed_edges)%500 == 0:
        print(len(removed_edges), G.number_of_edges())
    i = random.randint(0, H.number_of_edges())
    edge = list(H.edges)[i]
    u = edge[0]
    v = edge[1]
    if H.degree[u] > 1 and H.degree[v] > 1:
        H.remove_edge(u, v)
        removed_edges.add((u, v))
G = copy.deepcopy(H)

0 44327
500 44327
1000 44327
1500 44327
2000 44327
2000 44327
2500 44327
3000 44327
3000 44327
3000 44327
3000 44327
3500 44327
4000 44327


In [9]:
len(removed_edges)

4433

In [10]:
edges_path = os.path.join(data_dir, "graph_edges.txt")
with open(edges_path, 'w') as outfile:
    for edge in G.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [11]:
removed_edges_path = os.path.join(data_dir, "removed_edges.txt")
with open(removed_edges_path, 'w') as outfile:
    for edge in removed_edges:
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [12]:
subject, feats = nodedata['12187484']

In [13]:
feats

{'w-rat': 0.09393489570187145,
 'w-common': 0.028698458467273157,
 'w-use': 0.01176012652514843,
 'w-examin': 0.019375414753592942,
 'w-pathogenesi': 0.06316131961800078,
 'w-retinopathi': 0.17089058531360632,
 'w-mous': 0.06770248034355311,
 'w-studi': 0.017554610474374233,
 'w-anim': 0.09840151241009497,
 'w-model': 0.06269133038832954,
 'w-metabol': 0.06232233318170418,
 'w-abnorm': 0.11247870345628387,
 'w-contribut': 0.02534773765067718,
 'w-develop': 0.030388826051908086,
 'w-investig': 0.02014612607562432,
 'w-mice': 0.12119873074191996,
 'w-2': 0.020571546813213402,
 'w-month': 0.10361986739277738,
 'w-compar': 0.02367140886552208,
 'w-obtain': 0.03061978039959059,
 'w-method': 0.014469342700659771,
 'w-induc': 0.023516442702830022,
 'w-6': 0.014872498687869398,
 'w-inject': 0.028054999329982466,
 'w-experiment': 0.06866787644053303,
 'w-normal': 0.01777754779525323,
 'w-diet': 0.031956203604979944,
 'w-30': 0.02512131278693402,
 'w-hyperglycemia': 0.02896081409449482,
 'w-leve

In [14]:
samples = {}
for i in range(1000):
    w = random.choices(population=list(feats.keys()), weights=list(feats.values()), k=1)[0]
    samples.setdefault(w, 0)
    samples[w] += 1

for w, c in samples.items():
    if c > 50:
        print(w, c)

w-retinopathi 72
w-oxid 55
w-anim 51
w-abnorm 69
w-mice 58


In [15]:
def random_walk(G, node, depth, features):
    node = str(node)
    cnt = 0
    curr_node = node
    while cnt < depth and G.degree[curr_node] > 0:
        nbrs = [nbr for nbr in G.neighbors(curr_node)]
        curr_node = nbrs[random.randint(0, len(nbrs)-1)]
        cnt += 1
    subject, features_node = features[curr_node]
    w = random.choices(population=list(features_node.keys()), weights=list(features_node.values()), k=1)[0]
    return curr_node, subject, w

In [16]:
node = list(G.nodes())[230]
random_walk(G, node, 2, features=nodedata)

('3485683', 'label=2', 'w-anim')

In [17]:
def all_nodes_random_walk(G, depth, nr_walks, features):
    vectors = {}
    for node in G.nodes():
        vectors[node] = [None for _ in range(nr_walks)]
        for walk in range(nr_walks):
            sample, subject, feature = random_walk(G, node, depth, features)
            vectors[node][walk] = (sample, subject, feature)
    return vectors

In [18]:
vectors = all_nodes_random_walk(G, 2, emb_size, features=nodedata)

In [19]:
vectors[list(vectors.keys())[10000]][:4]

[('6452065', 'label=1', 'w-differ'),
 ('6452065', 'label=1', 'w-respect'),
 ('9764597', 'label=1', 'w-express'),
 ('9764597', 'label=1', 'w-express')]

In [20]:
jsonpath = os.path.join(data_dir, "vectors_rwalk" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(vectors, outfile)

In [21]:
def get_rnd_value(stream, min_val):
    rval = 0
    while rval < min_val:
        rval = stream.readUInt64()/(2*sys.maxsize)
    return rval

In [22]:
def minwise_iterate(G, rnd_nodes, nr_iter, features):
    node_labels = [{} for _ in range(nr_iter+1)]
    if nr_iter < 1:
        raise Exception("There must be at least one iteration")
    node_labels[0] = rnd_nodes
    for iter in range(nr_iter):
        rnd_nodes_iter = node_labels[iter]
        print('Iteration', iter)
        for u in G.nodes():
            w_u = rnd_nodes_iter[u]
            for v in G.neighbors(u):
                w_u = min(rnd_nodes_iter[v], w_u)
            node_labels[iter+1][u] = w_u
    return node_labels

In [23]:
def update_dict(d, k, stream, min_val):
    if k not in d:
        d[k] = get_rnd_value(stream, min_val)

In [24]:
# initialize random numbers for nodes and features for each embedding 
randompath = "/home/koki/Desktop/Data/random/merged"
file = open(randompath, 'rb')
stream = BinaryStream(file)
min_val = 1e-6
nodes_rnd = [{} for _ in range(emb_size)]
cat_rnd = [{} for _ in range(emb_size)]
feats_rnd = [{} for _ in range(emb_size)]
for i in range(emb_size):
    nodes_rnd_i = nodes_rnd[i]
    cat_rnd_i = cat_rnd[i]
    feats_rnd_i = feats_rnd[i]
    for node, feats in nodedata.items():
        update_dict(nodes_rnd_i, node, stream, min_val)
        update_dict(cat_rnd_i, feats[0], stream, min_val)
        for f, weight_f in feats[1].items():
            update_dict(feats_rnd_i, f, stream, min_val)
print(cat_rnd[:2])

[{'label=1': 0.37134512586272445, 'label=2': 0.7348662116490369, 'label=3': 0.06556687340200722}, {'label=1': 0.7356201732172408, 'label=2': 0.9963940057090247, 'label=3': 0.7311762590078548}]


In [25]:
# initialize nodes with a sampled feature
node_labels = [{} for _ in range(emb_size)]
for i in range(emb_size):
    node_labels_i = node_labels[i]
    feats_rnd_i = feats_rnd[i]
    for node, feats in nodedata.items():
        min_feature_value = 1e3
        min_feature = None
        for f in feats[1]:
            if feats_rnd_i[f] < min_feature_value:
                min_feature = f
                min_feature_value = feats_rnd_i[f]
        node_labels_i[node] = (min_feature_value, node, feats[0], min_feature)

In [26]:
nr_iter = 2
node_labels_all = [[{} for _ in range(emb_size)] for _ in range(nr_iter+1)]
node_labels_all[0] = node_labels
for i in range(nr_iter):
    node_labels_iter = node_labels_all[i]
    print('Iteration', i)
    for u in G.nodes():
        for t in range(emb_size):
            w_u = node_labels_iter[t][u]
            for v in G.neighbors(u):
                    w_u = min(node_labels_iter[t][v], w_u)
            node_labels_all[i+1][t][u] = w_u
            
node_embeddings = {n:[] for n in G.nodes()}
for u in G.nodes():
    for nl in node_labels_all[nr_iter]:
        node_embeddings[u].append((nl[u][1], nl[u][2], nl[u][3]))

Iteration 0
Iteration 1


In [27]:
jsonpath = os.path.join(data_dir, "vectors_minwise" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(node_embeddings, outfile)

In [28]:
node = list(G.nodes())[2030]
node_embeddings[node][:5]

[('6200477', 'label=1', 'w-spontan'),
 ('1350902', 'label=1', 'w-liver'),
 ('1350902', 'label=1', 'w-therefor'),
 ('3777161', 'label=1', 'w-wherea'),
 ('1350902', 'label=1', 'w-rat')]

In [29]:
def update_sketch(sketch, cap, u, w_u_cnt, w_u_over):
    sketch_new = copy.deepcopy(sketch)
    if u in sketch:
        sketch_new[u]['cnt'] += w_u_cnt
        sketch_new[u]['over'] += w_u_over
    else:
        if len(sketch_new) > cap:
            min_node = None
            min_cnt = -1
            for v, est_v in sketch_new.items():
                if min_cnt == -1 or est_v['cnt'] <= min_cnt:
                    min_node = v
                    min_cnt = est_v['cnt']
            del sketch_new[min_node]
            sketch_new[u] = {'cnt' : w_u_cnt+min_cnt, 'over' : w_u_over+min_cnt}
        else:
            sketch_new[u] = {'cnt' : w_u_cnt, 'over' : w_u_over}
    return sketch_new

In [30]:
def generate_L1_samples(G, nodedata, nr_iter, capacity):
    sketches = [{} for _ in range(nr_iter+1)]
    for u, (subject, features) in nodedata.items():
        sketches[0][u] = {}
        for f, w_f in features.items():
            sketches[0][u][f] = {'cnt' : w_f/random.random(), 'over' : 0}
    #sketches[0] = {u : {u : {'cnt' : 1/r, 'over' : 0}} for u, features in nodedata.items()}
     
    for iter in range(nr_iter):
        print('ITERATION', iter)
        sketch_iter = copy.deepcopy(sketches[iter])
        new_sketches = {}
        cnt = 0
        for u in G.nodes():
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            sketch_u = sketch_iter[u]
            for v in G.neighbors(u):
                sketch_v = sketch_iter[v]
                for t, w_t in sketch_v.items():
                    sketch_u = update_sketch(sketch_u, capacity, t, w_t['cnt'], w_t['over'])
            new_sketches[u] = sketch_u
#         print('updating sketches', iter+1)
#         print(len(new_sketches))
        sketches[iter+1] = new_sketches
    return sketches

In [32]:
# sketches = generate_L1_samples(G, nodedata, 2, 100)

In [33]:
len(nodedata)

19717

In [47]:
def generate_L1_samples_heuristic(G, nodedata, nr_iter, embsize, top):
    sketches = [[{} for _ in range(embsize)] for _ in range(nr_iter+1)]
    for u, (subject, features) in nodedata.items():
        for i in range(embsize):
            sketches[0][i][u] = {}
            max_w = 0
            for f, w_f in features.items():
                sketches[0][i][u][f] = w_f/random.random()
    
    for iter in range(nr_iter):
        print('ITERATION', iter)
        for emb in range(embsize):
            print('emb', emb)
            sketch_iter_emb = copy.deepcopy(sketches[iter][emb])
            print(len(sketch_iter_emb))
            new_sketches = {}
            for u in G.nodes():
                sketch_u = copy.deepcopy(sketch_iter_emb[u])
                for v in G.neighbors(u):
                    sketch_v = sketch_iter_emb[v]
                    for t, w_t in sketch_v.items():
                        sketch_u.setdefault(t, 0)
                        sketch_u[t] += w_t
#                 max_node = u
#                 max_w = 0
                pairs = []
                for feat, feat_w in sketch_u.items():
                    pairs.append((feat_w, feat))
                top_pairs = sorted(pairs, reverse=True)[:top]
#                     if node_w > max_w:
#                         max_node = node
#                         max_w = node_w
#                     # sketch_u = update_sketch(sketch_u, capacity, t, w_t['cnt'], w_t['over'])

                new_sketches[u] = {n : w for w, n in top_pairs}
            sketches[iter+1][emb] = new_sketches
    return sketches

In [50]:
sketches = generate_L1_samples_heuristic(G, nodedata, nr_iter=2, embsize=50, top=5)

ITERATION 0
emb 0
19717
emb 1
19717
emb 2
19717
emb 3
19717
emb 4
19717
emb 5
19717
emb 6
19717
emb 7
19717
emb 8
19717
emb 9
19717
emb 10
19717
emb 11
19717
emb 12
19717
emb 13
19717
emb 14
19717
emb 15
19717
emb 16
19717
emb 17
19717
emb 18
19717
emb 19
19717
emb 20
19717
emb 21
19717
emb 22
19717
emb 23
19717
emb 24
19717
emb 25
19717
emb 26
19717
emb 27
19717
emb 28
19717
emb 29
19717
emb 30
19717
emb 31
19717
emb 32
19717
emb 33
19717
emb 34
19717
emb 35
19717
emb 36
19717
emb 37
19717
emb 38
19717
emb 39
19717
emb 40
19717
emb 41
19717
emb 42
19717
emb 43
19717
emb 44
19717
emb 45
19717
emb 46
19717
emb 47
19717
emb 48
19717
emb 49
19717
ITERATION 1
emb 0
19717
emb 1
19717
emb 2
19717
emb 3
19717
emb 4
19717
emb 5
19717
emb 6
19717
emb 7
19717
emb 8
19717
emb 9
19717
emb 10
19717
emb 11
19717
emb 12
19717
emb 13
19717
emb 14
19717
emb 15
19717
emb 16
19717
emb 17
19717
emb 18
19717
emb 19
19717
emb 20
19717
emb 21
19717
emb 22
19717
emb 23
19717
emb 24
19717
emb 25
19717
emb 26
1

In [54]:
for k, w in sketches[2][1]['12187484'].items():
    print(k, w)

w-metabol 224.69174734108432
w-normal 158.5176170204383
w-rat 94.44323365788468
w-phenotyp 77.6615874242988
w-detect 64.2937102704091


In [55]:
embeddings = {}
for node in nodedata.keys():
    embeddings[node] = [None for _ in range(3)]
for e in range(3):
    for node, d in sketches[2][e].items():
        max_word = None
        max_weight = 0
        for word, weight in d.items():
            if weight > max_weight:
                max_word = word
                max_weight = weight
        embeddings[node][e] = (max_word, max_weight)

In [56]:
jsonpath = os.path.join(data_dir, "vectors_l1" + str(emb_size) + ".json")
with open(jsonpath, 'w') as outfile:
    json.dump(embeddings, outfile)

In [78]:
G.degree['8543793']

13

In [89]:
nodedata['12187484']

('label=1',
 {'w-rat': 0.09393489570187145,
  'w-common': 0.028698458467273157,
  'w-use': 0.01176012652514843,
  'w-examin': 0.019375414753592942,
  'w-pathogenesi': 0.06316131961800078,
  'w-retinopathi': 0.17089058531360632,
  'w-mous': 0.06770248034355311,
  'w-studi': 0.017554610474374233,
  'w-anim': 0.09840151241009497,
  'w-model': 0.06269133038832954,
  'w-metabol': 0.06232233318170418,
  'w-abnorm': 0.11247870345628387,
  'w-contribut': 0.02534773765067718,
  'w-develop': 0.030388826051908086,
  'w-investig': 0.02014612607562432,
  'w-mice': 0.12119873074191996,
  'w-2': 0.020571546813213402,
  'w-month': 0.10361986739277738,
  'w-compar': 0.02367140886552208,
  'w-obtain': 0.03061978039959059,
  'w-method': 0.014469342700659771,
  'w-induc': 0.023516442702830022,
  'w-6': 0.014872498687869398,
  'w-inject': 0.028054999329982466,
  'w-experiment': 0.06866787644053303,
  'w-normal': 0.01777754779525323,
  'w-diet': 0.031956203604979944,
  'w-30': 0.02512131278693402,
  'w-hype