In [None]:
import copy
import json
import os
import random

import networkx as nx
import numpy as np
import pandas as pd
import scipy.io
from scipy.sparse import csc_matrix

In [None]:
# Name of the dataset; it is also the sub-directory of data_dir that the cells below read from and write to.
graphname = 'BlogCatalog'
# Root directory of the graph datasets, relative to the notebook's working directory.
# expanduser() only rewrites a leading "~", so this relative path is returned unchanged.
data_dir = os.path.expanduser("../Graphs/")

In [None]:
# Read <data_dir>/<graphname>/edges.csv in a single pass.
# The first two comma-separated fields of each row are taken as the edge
# endpoints and kept as whitespace-stripped strings. The same pass fills:
#   nodes - set of every endpoint id seen
#   edges - list of (u, v) tuples in file order
#   G     - undirected networkx graph built from those edges
nodes = set()
edges = []
G = nx.Graph()
with open(os.path.join(data_dir, graphname + "/edges.csv"), 'r') as edgefile:
    for line in edgefile:
        if not line.strip():
            # A blank row has no second field and would raise IndexError below.
            continue
        fields = line.split(',')
        u = fields[0].strip()
        v = fields[1].strip()
        nodes.update((u, v))
        edges.append((u, v))
        G.add_edge(u, v)

In [None]:
# Peek at the first five node ids (iterating a networkx graph yields its nodes).
list(G)[:5]

In [None]:
# Build one label string per node from <data_dir>/<graphname>/group-edges.csv.
# The first two comma-separated fields of each row are read as (node, group).
# All groups of a node are concatenated into "label=<g1>=<g2>=...=".
nodedata = {}
nodelabels = {}
with open(os.path.join(data_dir, graphname + "/group-edges.csv"), 'r') as contentfile:
    for line in contentfile:
        if not line.strip():
            # A blank row has no second field and would raise IndexError below.
            continue
        line_split = line.split(',')
        node = line_split[0].strip()
        label = line_split[1].strip()
        if node not in nodes:
            # Group rows for ids that never appeared in edges.csv are reported and ignored.
            print('No node', line)
            continue
        nodelabels.setdefault(node, "label=")
        nodelabels[node] += label + "="

# nodedata maps node id -> (label string, feature dict).
# random.random() is uniform on [0, 1), so roughly 20% of the nodes get their
# label string prefixed with "None". The draw is unseeded, so the selection
# differs between runs. Requires `import random` in the notebook's import cell.
for node, label in nodelabels.items():
    if random.random() >= 0.8:
        label = "None" + label
    # The only feature of a node is its own id with value 1.
    nodedata[node] = (label, {node: 1})

# Disabled: would also mark every neighbour of a node in its feature dict.
# for edge in edges:
#     nodedata[edge[0]][1][edge[1]] = 1
#     nodedata[edge[1]][1][edge[0]] = 1

In [None]:
# Persist nodedata as JSON; each (label, features) tuple is serialised as a two-element array.
jsonpath = data_dir + graphname + "/data/nodedata.json"
# open(..., 'w') does not create missing parent directories, so make sure
# <data_dir>/<graphname>/data/ exists before the first output file is written.
os.makedirs(os.path.dirname(jsonpath), exist_ok=True)
print(jsonpath)
with open(jsonpath, 'w') as outfile:
    json.dump(nodedata, outfile)

In [None]:
# Distinct node ids, label strings and feature keys present in nodedata.
# Note: this rebinds `nodes` to the labelled nodes only.
nodes = {node for node in nodedata}
labels = {features[0] for features in nodedata.values()}
word_indices = {str(w) for features in nodedata.values() for w in features[1]}

# Each collection is written to its own text file, one entry per line.
# Sets are unordered, so the line order of these files is arbitrary.
nodes_path = data_dir + graphname + "/data/graph_nodes.txt"
with open(nodes_path, 'w') as outfile:
    outfile.writelines(str(node) + '\n' for node in nodes)

labels_path = data_dir + graphname + "/data/labels.txt"
with open(labels_path, 'w') as outfile:
    outfile.writelines(label + '\n' for label in labels)

words_path = data_dir + graphname + "/data/words_indices.txt"
with open(words_path, 'w') as outfile:
    outfile.writelines(wi + '\n' for wi in word_indices)

In [None]:
# Tabulate node id -> label string and save it as CSV without the index column.
rows = [{'node': node, 'label': feats[0]} for node, feats in nodedata.items()]
nodedata_df = pd.DataFrame(rows)
nodedata_df.to_csv(data_dir + graphname + '/data/nodes_with_labels.csv', index=False)

In [None]:
# Rebuild G from scratch, keeping only edges whose two endpoints both have an entry in nodedata.
G = nx.Graph()
G.add_edges_from(
    (u, v) for (u, v) in edges if u in nodedata and v in nodedata
)

In [None]:
# Display (node count, edge count) of the current graph G.
G.number_of_nodes(), G.number_of_edges()

In [None]:
# Dump every edge of G as "u:v", one edge per line.
edges_path = data_dir + graphname + "/data/all_graph_edges.txt"
with open(edges_path, 'w') as outfile:
    outfile.writelines(':'.join((u, v)) + '\n' for u, v in G.edges())

In [None]:
# Extract the largest connected component of G as an independent graph and display its diameter.
largest_cc = max(nx.connected_components(G), key=len)
CC = G.subgraph(largest_cc).copy()
nx.diameter(CC)

In [None]:
# Randomly drop a fraction `threshold` of G's edges, producing the reduced graph H.
# An edge is dropped only while both of its endpoints still have degree > 1 in H,
# so the removal never lowers a node's degree to zero.
#   removed_edges - set of the (u, v) tuples that were dropped
#   H             - copy of G without those edges (G itself is left untouched)
removed_edges = set()
H = G.copy()
threshold = 0.02
target = threshold * G.number_of_edges()

# Visit the edges once in a uniformly random order instead of rebuilding
# list(H.edges()) for every draw. Degrees in H only ever decrease, so an edge
# rejected once can never become removable later; a single pass therefore
# considers exactly the same candidates as repeated sampling with retries,
# and it is guaranteed to terminate.
candidates = list(G.edges())
for i in np.random.permutation(len(candidates)):
    if len(removed_edges) >= target:
        break
    u, v = candidates[i]
    if H.degree[u] > 1 and H.degree[v] > 1:
        H.remove_edge(u, v)
        removed_edges.add((u, v))
        # Report progress once per 1000 removals.
        if len(removed_edges) % 1000 == 0:
            print(len(removed_edges), G.number_of_edges())

if len(removed_edges) < target:
    # Every remaining edge touches a node of degree 1; nothing more can be removed.
    print('Only', len(removed_edges), 'removable edges found; target was', target)

In [None]:
# Dump the edges that survive in the reduced graph H as "u:v", one per line.
edges_path = data_dir + graphname + "/data/graph_edges_reduced.txt"
with open(edges_path, 'w') as outfile:
    outfile.writelines(':'.join((u, v)) + '\n' for u, v in H.edges())

In [None]:
# Write the dropped edges as "u:v", one per line; no file is written when nothing was removed.
if removed_edges:
    removed_edges_path = data_dir + graphname + "/data/removed_edges.txt"
    with open(removed_edges_path, 'w') as outfile:
        outfile.writelines(':'.join((u, v)) + '\n' for u, v in removed_edges)