## Graph preprocessing

We convert the the three citation graphs into a common format consisting of a list of edges and a dictionary for each node with different attributes such as its class and word features.

The graphs can be downloaded from https://linqs.soe.ucsc.edu/data

In [1]:
import os
import networkx as nx
import pandas as pd
import numpy as np
import json
import copy
from BinaryStream import BinaryStream

In [2]:
graphnames = ['Cora', 'Citeseer']
graphname = graphnames[1]
data_dir = os.path.expanduser("../Graphs/" + graphname)

edges_df = pd.read_csv(os.path.join(data_dir, graphname.lower() + ".cites"), \
                       sep='\t', header=None, names=["target", "source"])

In [3]:
# randompath = "../random/bits.01"
# file = open(randompath, 'rb')
# stream = BinaryStream(file)

In [4]:
content = pd.read_csv(os.path.join(data_dir, graphname.lower() +".content"), \
                        sep='\t', header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
content.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3695,3696,3697,3698,3699,3700,3701,3702,3703,3704
0,100157,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
1,100598,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,IR
2,105684,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents


In [6]:
feature_names = ["w-{}".format(ii) for ii in range(content.shape[1]-2)]
column_names =  ['node'] + feature_names + ["label"]
nodedata_df = pd.read_csv(os.path.join(data_dir, graphname.lower() +".content"), \
                        sep='\t',  names=column_names)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
def get_rnd_int_in_range(stream, start, end):
    diff = end - start + 1
    rval = stream.readUInt64()%diff
    return rval+start

In [8]:
np.random.randint(0, 5)

4

In [9]:
nodedata_df.head(3)

Unnamed: 0,node,w-0,w-1,w-2,w-3,w-4,w-5,w-6,w-7,w-8,...,w-3694,w-3695,w-3696,w-3697,w-3698,w-3699,w-3700,w-3701,w-3702,label
0,100157,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
1,100598,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,IR
2,105684,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents


In [10]:
nodedata_df['node'].nunique()

3312

In [11]:
#nodedata_df[['node', 'label']].to_csv(data_dir + '/data/nodes_with_labels.csv', index=False)

In [12]:
edges_df.head(3)

Unnamed: 0,target,source
0,100157,100157
1,100157,364207
2,100157,38848


In [13]:
edges_df.shape

(4732, 2)

In [14]:
nodedata_df['node'].nunique()

3312

In [15]:
nodedata = {}
for idx, row in nodedata_df.iterrows():
    nodedata[str(row['node'])] = ('label=' + row['label'], {})
    if len(nodedata) % 500 == 0:
        print(len(nodedata))
    for c in nodedata_df.columns:
        if c[0] == 'w':
            if row[c] != 0:
                nodedata[str(row['node'])][1][c] =1

500
1000
1500
2000
2500
3000


In [16]:
ndata_df = pd.DataFrame()
rows = []
for node, feats in nodedata.items():
    row = {}
    row['node'] = node
    row['label'] = feats[0]
    rows.append(row)
ndata_df = pd.DataFrame(rows)    
ndata_df.to_csv(data_dir + '/data/nodes_with_labels.csv', index=False)    

In [17]:
jsonpath = data_dir + "/data/nodedata.json" #os.path.join(data_dir, "/data/nodedata.json")
print(jsonpath)
with open(jsonpath, 'w') as outfile:
    json.dump(nodedata, outfile)

../Graphs/Citeseer/data/nodedata.json


In [18]:
nodes = set()
labels = set()
word_indices = set()
for node, features in nodedata.items():
    nodes.add(node)
    labels.add(features[0])
    for w in features[1]:
        word_indices.add(str(w))
        
nodes_path = data_dir + "/data/graph_nodes.txt"#os.path.join(data_dir, "graph_nodes.txt")
with open(nodes_path, 'w') as outfile:
    for node in nodes:
        outfile.write(str(node) + '\n')
        
labels_path = data_dir + "/data/labels.txt" #os.path.join(data_dir, "labels.txt")
with open(labels_path, 'w') as outfile:
    for label in labels:
        outfile.write(label + '\n')
        
words_path = data_dir + "/data/words_indices.txt" # os.path.join(data_dir, "words_indices.txt")
with open(words_path, 'w') as outfile:
    for wi in word_indices:
        outfile.write(wi + '\n')
        
edges = []
for idx, row in edges_df.iterrows():
    edges.append((str(row['target']).strip(), str(row['source']).strip()))

In [19]:
edges = []
for idx, row in edges_df.iterrows():
    edges.append((str(row['target']).strip(), str(row['source']).strip()))
    
G = nx.Graph()
for edge in edges:
    u = edge[0]
    v = edge[1]
    if u in nodedata and v in nodedata:
        G.add_edge(u, v)

In [20]:
G.number_of_nodes(), G.number_of_edges()

(2708, 5278)

In [21]:
largest_cc = max(nx.connected_components(G), key=len)
CC = G.subgraph(largest_cc).copy()
nx.algorithms.distance_measures.diameter(CC)

19

In [20]:
removed_edges = set()
H = copy.deepcopy(G)
threshold = 0.2
while len(removed_edges) < threshold*G.number_of_edges():
    if len(removed_edges)%500 == 0:
        print(len(removed_edges), G.number_of_edges())
    i = np.random.randint(0, H.number_of_edges()-1) #get_rnd_int_in_range(stream, 0, H.number_of_edges()-1)
    edge = list(H.edges())[i]
    u = edge[0]
    v = edge[1]
    #if H.degree[u] > 1 and H.degree[v] > 1:
    H.remove_edge(u, v)
    removed_edges.add((u, v))

0 4660
500 4660


In [21]:
edges_path = data_dir + "/data/all_graph_edges.txt" 
with open(edges_path, 'w') as outfile:
    for edge in G.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [22]:
edges_path = data_dir + "/data/reduced_graph_edges.txt"
with open(edges_path, 'w') as outfile:
    for edge in H.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [23]:
if len(removed_edges) > 0:
    removed_edges_path = data_dir + "/data/removed_edges.txt" 
    with open(removed_edges_path, 'w') as outfile:
        for edge in removed_edges:
            outfile.write(edge[0] + ':' + edge[1] + '\n')

## Pubmed preprocessing

In [37]:
graphname = 'Pubmed'
data_dir = os.path.expanduser("../Graphs/"+graphname)

In [38]:
nodes = set()
edges = []
with open(os.path.join(data_dir, graphname.lower() + ".cites"), 'r') as edgefile:
    for line in edgefile:
        line_split = line.split('|')
        if len(line_split) > 1:
            l0 = line_split[0]
            l1 = line_split[1]
            u = l0.split(':')[1]
            v = l1.split(':')[1]
            nodes.add(str(u).strip())
            nodes.add(str(v).strip())
            edges.append((str(u).strip(), str(v).strip()))

In [39]:
nodedata = {}
with open(os.path.join(data_dir, graphname.lower() + ".content"), 'r') as contentfile:
    for line in contentfile:
        line_split = line.split()
        if len(line_split) < 3:
            continue
        if line_split[0] not in nodes:
            continue
        nodewords = {}
        for i in range(2, len(line_split)):
            w = line_split[i]
            w_split = w.split('=')
            if w_split[0] == 'summary':
                continue
            nodewords[w.split('=')[0]] = float(w.split('=')[1])
        nodedata[line_split[0]] = (line_split[1], nodewords)

In [40]:
len(nodedata)

19717

In [41]:
jsonpath = data_dir + "/data/nodedata.json" 
with open(jsonpath, 'w') as outfile:
    json.dump(nodedata, outfile)

In [42]:
nodedata_df = pd.DataFrame()
rows = []
for node, feats in nodedata.items():
    row = {}
    row['node'] = node
    row['label'] = feats[0]
    rows.append(row)
nodedata_df = pd.DataFrame(rows)    
nodedata_df.to_csv(data_dir + '/data/nodes_with_labels.csv', index=False)    

In [43]:
nodedata

{'12187484': ('label=1',
  {'w-rat': 0.09393489570187145,
   'w-common': 0.028698458467273157,
   'w-use': 0.01176012652514843,
   'w-examin': 0.019375414753592942,
   'w-pathogenesi': 0.06316131961800078,
   'w-retinopathi': 0.17089058531360632,
   'w-mous': 0.06770248034355311,
   'w-studi': 0.017554610474374233,
   'w-anim': 0.09840151241009497,
   'w-model': 0.06269133038832954,
   'w-metabol': 0.06232233318170418,
   'w-abnorm': 0.11247870345628387,
   'w-contribut': 0.02534773765067718,
   'w-develop': 0.030388826051908086,
   'w-investig': 0.02014612607562432,
   'w-mice': 0.12119873074191996,
   'w-2': 0.020571546813213402,
   'w-month': 0.10361986739277738,
   'w-compar': 0.02367140886552208,
   'w-obtain': 0.03061978039959059,
   'w-method': 0.014469342700659771,
   'w-induc': 0.023516442702830022,
   'w-6': 0.014872498687869398,
   'w-inject': 0.028054999329982466,
   'w-experiment': 0.06866787644053303,
   'w-normal': 0.01777754779525323,
   'w-diet': 0.031956203604979944,


In [44]:
nodes = set()
labels = set()
word_indices = set()
for node, features in nodedata.items():
    nodes.add(node)
    labels.add(features[0])
    for w in features[1]:
        word_indices.add(str(w))
        
nodes_path = data_dir + "/data/graph_nodes.txt" 
with open(nodes_path, 'w') as outfile:
    for node in nodes:
        outfile.write(str(node) + '\n')
        
labels_path = data_dir + "/data/labels.txt" 
with open(labels_path, 'w') as outfile:
    for label in labels:
        outfile.write(label + '\n')
        
words_path = data_dir + "/data/words_indices.txt" 
with open(words_path, 'w') as outfile:
    for wi in word_indices:
        outfile.write(wi + '\n')

edges_path = os.path.join(data_dir, "edges.txt")
with open(edges_path, 'w') as outfile:
    for e in edges:
        outfile.write(e[0] + '-' + e[1] + '\n')

In [45]:
G = nx.Graph()
for edge in edges:
    u = edge[0]
    v = edge[1]
    if u in nodedata and v in nodedata:
        G.add_edge(u, v)

In [46]:
largest_cc = max(nx.connected_components(G), key=len)
CC = G.subgraph(largest_cc).copy()
nx.algorithms.distance_measures.diameter(CC)

18

In [47]:
removed_edges = set()
H = copy.deepcopy(G)
threshold = 0.2
while len(removed_edges) < threshold*G.number_of_edges():
    if len(removed_edges)%1000 == 0:
        print(len(removed_edges), G.number_of_edges())
    i = np.random.randint(0, H.number_of_edges()-1) # get_rnd_int_in_range(stream, 0, H.number_of_edges()-1)
    edge = list(H.edges())[i]
    u = edge[0]
    v = edge[1]
    if H.degree[u] > 1 and H.degree[v] > 1:
        H.remove_edge(u, v)
        removed_edges.add((u, v))

0 44327
1000 44327
1000 44327
2000 44327
2000 44327
3000 44327
4000 44327
5000 44327
6000 44327
6000 44327
6000 44327
7000 44327
8000 44327


In [48]:
edges_path = data_dir + "/data/all_graph_edges.txt" 
with open(edges_path, 'w') as outfile:
    for edge in G.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [49]:
edges_path = data_dir + "/data/graph_edges_reduced.txt" 
with open(edges_path, 'w') as outfile:
    for edge in H.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [50]:
if len(removed_edges) > 0:
    removed_edges_path = data_dir + "/data/removed_edges.txt"  
    with open(removed_edges_path, 'w') as outfile:
        for edge in removed_edges:
            outfile.write(edge[0] + ':' + edge[1] + '\n')