## 1. Load Dataset.

In [1]:
from dataloader import load_citation, load_citation_v2, load_new_data
import time
# Time how long loading the dataset takes.
start_time = time.time()
# Root directory of the datasets; also used as the prefix for cache paths below.
dataf = "../data/"
# Normalisation scheme name forwarded to the loader.
norm_type = "SymNorm_tildeA"
#norm_type = "sym_normalized_A"
#original_graph, L, features, labels, idx_train,idx_val, idx_test = load_citation_v2(dataf,"Photo",norm_type=norm_type,cuda=False, identity_features=False)
# NOTE(review): this call unpacks nine values from load_new_data, while the
# "polblogs" backup cell further down unpacks eight -- confirm which matches
# the current dataloader.
original_graph, L, features, labels, idx_train,idx_val, idx_test, graph_additional_package, data_package = load_new_data(dataf,"cora_ml",cuda=False,norm_type=norm_type,identity_features=False)
# `name` is later used to build cache/output paths (dataf + name + "/..."), and
# differs from the "cora_ml" key passed to the loader above.
name = "Cora-ML"
# Elapsed loading time in seconds.
print(time.time() - start_time)

0.5351672172546387


## 2. Common function definitions, including the computation of SPD (shortest-path distance to training nodes) and KFS (top-k feature similarity to training nodes).

In [6]:
import collections 
import time
import pickle as pkl
import os
def constructNeighborSet(graph_in):
    """Build an adjacency mapping ``node -> list of receivers``.

    ``graph_in`` is a dict with the keys ``"node_num"``, ``"senders"`` and
    ``"receivers"``.  Every node id in ``range(node_num)`` gets an entry (an
    empty list when it sends no edge); sender ids outside that range are
    added on first sight.  Edge ``i`` goes from ``senders[i]`` to
    ``receivers[i]`` and duplicates are kept.
    """
    adjacency = {node: [] for node in range(graph_in["node_num"])}
    edge_targets = graph_in["receivers"]
    for edge_pos, source in enumerate(graph_in["senders"]):
        adjacency.setdefault(source, []).append(edge_targets[edge_pos])
    return adjacency

def normalized(dist):
    """Scale the values of ``dist`` so that they sum to one.

    When the values sum to zero the input list is returned unchanged (the
    same object), which avoids a division by zero.
    """
    total = sum(dist)
    if total == 0:
        return dist
    return [value / total for value in dist]


def getShortestPathDistanceNodes(node_num, neighbor_set, anchor_list, labels):
    """For every node, find its nearest anchor nodes by breadth-first search.

    Args:
        node_num: number of nodes; node ids are ``0 .. node_num - 1``.
        neighbor_set: mapping ``node -> list of neighbour ids``.
        anchor_list: ids of the anchor (training) nodes.
        labels: sequence of integer class labels, indexed by node id.

    Returns:
        A list with one dict per node:
        ``"dis"`` is the hop distance to the closest anchor, or the string
        ``"inf"`` when no anchor is reachable; ``"train_nodes"`` is the
        normalised class distribution of all anchors at that distance
        (all zeros when none is reachable).
    """
    anchor_set = set(anchor_list)
    num_class = max(labels) + 1
    shortest_path_list = []
    for source in range(node_num):
        nearest = "inf"
        class_distribution = [0] * num_class
        # Mark nodes when they are enqueued, not when they are dequeued:
        # otherwise a node reachable along several paths is queued once per
        # path, which counts the same anchor several times and can blow up
        # the queue.
        visited = [False] * node_num
        visited[source] = True
        queue = collections.deque([(source, 0)])
        while queue:
            node, depth = queue.popleft()
            found = nearest != "inf"
            if found and depth > nearest:
                # BFS pops in non-decreasing depth order, so nothing closer
                # or equally close can follow.
                break
            if node in anchor_set:
                nearest = depth
                class_distribution[labels[node]] += 1
                # Anything behind an anchor is strictly farther away.
                continue
            if found:
                # Same depth as the nearest anchors: neighbours are farther.
                continue
            for neighbor in neighbor_set[node]:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    queue.append((neighbor, depth + 1))
        shortest_path_list.append({
            "dis": nearest,
            "train_nodes": normalized(class_distribution)
        })
    return shortest_path_list
def getShortestPathDistance(original_graph, idx_train, labels):
    """Compute the nearest-training-node information for every graph node.

    The node count is taken from ``original_graph.size()[0]`` and the edge
    list from ``original_graph._indices()`` (row 1 is used as senders, row 0
    as receivers).  ``idx_train`` and ``labels`` are converted with
    ``.tolist()`` before being handed to ``getShortestPathDistanceNodes``,
    whose result is returned.
    """
    node_total = original_graph.size()[0]
    edge_sources = original_graph._indices()[1].tolist()
    edge_targets = original_graph._indices()[0].tolist()
    adjacency = constructNeighborSet({
        "node_num": node_total,
        "senders": edge_sources,
        "receivers": edge_targets,
    })
    return getShortestPathDistanceNodes(
        node_total, adjacency, idx_train.tolist(), labels.tolist()
    )
def getSavePath(dataf, data_name):
    """Return the SPD cache path ``<dataf><data_name>/<data_name>_SPD.pkl``."""
    cache_file = f"{data_name}/{data_name}_SPD.pkl"
    return dataf + cache_file
def saveJson(obj, savePath):
    """Serialise ``obj`` to ``savePath``.

    Despite the name, the file is written with ``pickle`` in binary mode,
    not as JSON.  An existing file is overwritten.
    """
    with open(savePath, "wb") as sink:
        pkl.dump(obj, sink)
def loadJson(savePath):
    """Read back an object written by ``saveJson``.

    The file is decoded with ``pickle`` (not JSON).  Unpickling can execute
    arbitrary code, so only point this at files you created yourself.
    """
    with open(savePath, "rb") as source:
        return pkl.load(source)
def getSPDJson(original_graph, idx_train, labels, dataf, data_name):
    """Return the shortest-path-distance data, using an on-disk cache.

    The cache file is ``getSavePath(dataf, data_name)``.  If it exists it is
    loaded and returned as is; otherwise the data is computed with
    ``getShortestPathDistance``, written to the cache and returned.

    NOTE: the cache is keyed only on ``dataf`` and ``data_name``; a cached
    file is reused even when ``idx_train`` or ``labels`` differ from the
    ones it was computed with.
    """
    savePath = getSavePath(dataf, data_name)
    if os.path.isfile(savePath):
        return loadJson(savePath)
    SPD = getShortestPathDistance(original_graph, idx_train, labels)
    # Make sure the target directory exists so the (expensive) result is not
    # lost to a FileNotFoundError while saving.
    cache_dir = os.path.dirname(savePath)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    saveJson(SPD, savePath)
    return SPD

In [None]:
SPD = getSPDJson(original_graph, idx_train, labels, dataf, name)

In [40]:
import torch
def calculateJaccardDistance(a,b):
    """Return the soft Jaccard ratio ``sum(a*b) / sum(1 - (1-a)*(1-b))``.

    Despite the name this is a similarity (larger means more alike), not a
    distance.  ``a`` and ``b`` must support element-wise arithmetic and
    ``.sum()``; presumably they are vectors with entries in [0, 1] -- verify
    against the caller.  When the union term sums to zero the division is
    0/0 (``nan`` for torch tensors).
    """
    # The former debug print of both full vectors was removed: it flooded
    # stdout on every call.
    intersection = a * b
    union = 1 - (1 - a) * (1 - b)
    return intersection.sum() / union.sum()
def calculateCosDistance(a, b, eps=1e-8):
    """Return the cosine similarity of tensors ``a`` and ``b`` as a 0-d tensor.

    Despite the name this is a similarity, not a distance.  Each norm is
    clamped to at least ``eps`` for numerical stability, so an all-zero
    input yields 0 instead of ``nan``.

    Clamping the norm directly (instead of comparing against a freshly
    allocated ``eps * torch.ones(1)``) keeps the computation on the inputs'
    own device and dtype and avoids one allocation per call.
    """
    a_unit = a / a.norm().clamp(min=eps)
    b_unit = b / b.norm().clamp(min=eps)
    return (a_unit * b_unit).sum()
def getTopkFeatureSimilaritySet(features, anchor_list, labels, k=5):
    """For every node, find the ``k`` anchors with the most similar features.

    Similarity is ``calculateCosDistance`` between ``features[node]`` and
    ``features[anchor]``.  ``labels`` is indexed by node id and must hold
    integer class ids.

    Returns a list with one dict per row of ``features``:
    ``"details"`` holds the top-``k`` anchors (dicts with ``anchor_id``,
    ``anchor_label`` and ``anchor_similarity``) in descending similarity,
    ties keeping the order of ``anchor_list``; ``"train_nodes"`` is the
    normalised class distribution of those anchors.
    """
    total_nodes = features.shape[0]
    class_count = max(labels) + 1
    per_node_results = []
    for node in range(total_nodes):
        scored_anchors = []
        for anchor in anchor_list:
            score = calculateCosDistance(features[node], features[anchor])
            scored_anchors.append({
                "anchor_id": anchor,
                "anchor_label": labels[anchor],
                "anchor_similarity": score.item()
            })
        # Stable in-place sort, highest similarity first.
        scored_anchors.sort(key=lambda entry: entry["anchor_similarity"], reverse=True)
        best_anchors = scored_anchors[:k]

        histogram = [0] * class_count
        for entry in best_anchors:
            histogram[entry["anchor_label"]] += 1

        per_node_results.append({
            "train_nodes": normalized(histogram),
            "details": best_anchors
        })
    return per_node_results
def getKFSSavePath(dataf, data_name):
    """Return the KFS cache path ``<dataf><data_name>/<data_name>_KFS.pkl``."""
    cache_file = f"{data_name}/{data_name}_KFS.pkl"
    return dataf + cache_file
def getKFSJson(features, idx_train, labels, dataf, data_name, k=5):
    """Compute the top-``k`` feature-similarity data for every node.

    ``idx_train`` and ``labels`` are converted with ``.tolist()`` and passed
    to ``getTopkFeatureSimilaritySet``.  The cache path is still resolved
    from ``dataf`` and ``data_name``, but on-disk caching is currently
    switched off, so the result is recomputed on every call and nothing is
    read from or written to that path.
    """
    cache_path = getKFSSavePath(dataf, data_name)  # unused while caching is disabled
    anchor_ids = idx_train.tolist()
    label_list = labels.tolist()
    return getTopkFeatureSimilaritySet(features, anchor_ids, label_list, k)

In [41]:
KFS = getKFSJson(features, idx_train, labels, dataf, name, 5)

In [42]:
print(KFS[1211])

{'train_nodes': [0.0, 0.6, 0.0, 0.0, 0.4, 0.0, 0.0], 'details': [{'anchor_id': 1151, 'anchor_label': 1, 'anchor_similarity': 0.1616433560848236}, {'anchor_id': 1066, 'anchor_label': 1, 'anchor_similarity': 0.1566666215658188}, {'anchor_id': 874, 'anchor_label': 4, 'anchor_similarity': 0.14626014232635498}, {'anchor_id': 2238, 'anchor_label': 4, 'anchor_similarity': 0.13114015758037567}, {'anchor_id': 2419, 'anchor_label': 1, 'anchor_similarity': 0.12761421501636505}]}


In [43]:
print(KFS[1210])

{'train_nodes': [0.0, 0.6, 0.0, 0.0, 0.2, 0.0, 0.2], 'details': [{'anchor_id': 1151, 'anchor_label': 1, 'anchor_similarity': 0.14745144546031952}, {'anchor_id': 1066, 'anchor_label': 1, 'anchor_similarity': 0.1429116427898407}, {'anchor_id': 1490, 'anchor_label': 6, 'anchor_similarity': 0.1344635933637619}, {'anchor_id': 874, 'anchor_label': 4, 'anchor_similarity': 0.13341885805130005}, {'anchor_id': 2419, 'anchor_label': 1, 'anchor_similarity': 0.12534424662590027}]}


# -----------------------------------  Backup Code -------------------------------  # 

In [None]:
import collections 
import time
start_time = time.time()
def getShortestPathDistance(node_num, neighbor_set, anchor_list):
    """Return, for every node, the hop distance to its closest anchor.

    Args:
        node_num: number of nodes; node ids are ``0 .. node_num - 1``.
        neighbor_set: mapping ``node -> list of neighbour ids``.
        anchor_list: ids of the anchor nodes.

    Returns:
        A list of length ``node_num`` whose entry is the distance as an int,
        or the string ``"inf"`` when no anchor is reachable from that node.
    """
    anchor_set = set(anchor_list)
    shortest_path_list = []
    for source in range(node_num):
        shortest_path_distance = "inf"
        # Mark nodes on enqueue so each one enters the queue at most once;
        # marking on dequeue queues a node once per path leading to it.
        visited = [False] * node_num
        visited[source] = True
        queue = collections.deque([(source, 0)])
        while queue:
            node, depth = queue.popleft()
            if node in anchor_set:
                # BFS order guarantees the first anchor popped is a closest one.
                shortest_path_distance = depth
                break
            for neighbor in neighbor_set[node]:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    queue.append((neighbor, depth + 1))
        shortest_path_list.append(shortest_path_distance)
    return shortest_path_list
# Backup driver for the distance-only BFS above.
# NOTE(review): `graph_in` and `cora_gcn_neighbor_set` are not defined at top
# level anywhere in the visible notebook (they only exist as locals inside the
# section-2 getShortestPathDistance), so this cell needs them created first.
node_num = graph_in["node_num"]
shortest_path_list = getShortestPathDistance(node_num, cora_gcn_neighbor_set, idx_train.tolist())
# Elapsed time in seconds since `start_time` was set at the top of the cell.
print(time.time() - start_time)

In [None]:
import collections 
import time
start_time = time.time()
def normalized(dist):
    """Return ``dist`` rescaled to sum to one.

    If the values add up to zero, the original list object is handed back
    untouched so that no division by zero happens.
    """
    denominator = sum(dist)
    if denominator != 0:
        return [entry / denominator for entry in dist]
    return dist
def getShortestPathDistanceNodes(node_num, neighbor_set, anchor_list, labels):
    """For every node, collect the anchors that are closest to it.

    One breadth-first search is run from each anchor and the per-node record
    is updated whenever that anchor is closer than, or as close as, the best
    seen so far.

    Args:
        node_num: number of nodes; node ids are ``0 .. node_num - 1``.
        neighbor_set: mapping ``node -> list of neighbour ids``.
        anchor_list: ids of the anchor nodes.
        labels: unused; kept so the signature matches the other variant.

    Returns:
        A list with one dict per node: ``"shortest_path_distance"`` is the
        hop distance to the closest anchor (the string ``"inf"`` when none
        reaches the node) and ``"shortest_path_train_nodes"`` lists the
        anchors at that distance, in ``anchor_list`` order.
    """
    shortest_path_list = [
        {"shortest_path_distance": "inf", "shortest_path_train_nodes": []}
        for _ in range(node_num)
    ]
    for anchor in anchor_list:
        # Mark nodes on enqueue: marking on dequeue lets a node be queued
        # once per path, which appended the same anchor to a node's list
        # several times and could blow up the queue.
        visited = [False] * node_num
        visited[anchor] = True
        queue = collections.deque([(anchor, 0)])
        while queue:
            node, depth = queue.popleft()
            record = shortest_path_list[node]
            best = record["shortest_path_distance"]
            if best == "inf" or best > depth:
                # This anchor is strictly closer than anything seen so far.
                record["shortest_path_distance"] = depth
                record["shortest_path_train_nodes"] = [anchor]
            elif best == depth:
                record["shortest_path_train_nodes"].append(anchor)

            for neighbor in neighbor_set[node]:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    queue.append((neighbor, depth + 1))

    return shortest_path_list
# Backup driver for the per-anchor BFS above.
# NOTE(review): `graph_in` and `cora_gcn_neighbor_set` are not defined at top
# level anywhere in the visible notebook; create them before running this cell.
node_num = graph_in["node_num"]
shortest_path_list = getShortestPathDistanceNodes(node_num, cora_gcn_neighbor_set, idx_train.tolist(), labels.tolist())
# Elapsed time in seconds since `start_time` was set at the top of the cell.
print(time.time() - start_time)

In [None]:
from dataloader import load_new_data
import time
# Backup cell: load the "polblogs" dataset instead of Cora-ML.
start_time = time.time()
dataf = "../data/"
norm_type = "SymNorm_tildeA"
#norm_type = "sym_normalized_A"
# NOTE(review): eight values are unpacked here, whereas the section-1 cell
# unpacks nine from the same load_new_data function -- confirm which one
# matches the current dataloader.
original_graph, L, features, labels, idx_train,idx_val, idx_test, graph_info = load_new_data(dataf,"polblogs",norm_type=norm_type,cuda=False, identity_features=True)
# Overwrites the dataset name used for cache/output paths.
name = "polblogs"
print(time.time() - start_time)

In [None]:
print(features.shape)

In [None]:
print(graph_info)


In [None]:
/github/GNNVis/server/data/g2g

In [None]:
import time
import argparse
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from models import GCN
from sklearn.metrics import f1_score
from utils import accuracy, set_seed


# Load a saved GCN state dict and report loss/accuracy on the test split.
# NOTE(review): the checkpoint is the "photo" model; make sure `features`, `L`,
# `labels` and `idx_test` were loaded for the same dataset before running.
model_path = "../models/gcn_photo_state.pkt"
#model_path = "../models/mlp_cora_state.pkt"
#torch.save(model.state_dict(), model_path)
# Positional constructor arguments for GCN; presumably
# [num_feature, num_hidden, num_class, dropout] as in GCN_hook below -- verify
# against models.GCN.  The commented alternatives belong to other datasets.
#args = [1433,16,7,0.5]
#args = [3703,16,6,0.5]
#args = [500, 16, 3, 0.5]
#args = [1222, 16, 2, 0.5]
args = [745, 16, 8, 0.5]
kwargs = {
    "bias": True,
}
model = GCN(*args,**kwargs)
# map_location='cpu' lets a checkpoint saved on GPU be loaded without CUDA.
model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Switch to evaluation mode before the forward pass.
model.eval()
output = model(features,L)
# nll_loss is applied directly to the model output on the test indices.
loss_test = F.nll_loss(output[idx_test],labels[idx_test])
acc_test = accuracy(output[idx_test],labels[idx_test])
print(loss_test.item(), acc_test.item())



In [None]:
# Gradient-based saliency: how strongly does each input feature influence the
# output row of one target node?
features = features.requires_grad_()
output = model(features,L)
# Index of the node being explained (hard-coded).
N = 645
# Upstream gradient that is 1 on every output entry of node N and 0 elsewhere,
# so backward() differentiates the sum of node N's outputs.
node_relevance = torch.zeros_like(output)
node_relevance[N] = 1
output.backward(node_relevance)

# Gradient w.r.t. the input features, same shape as `features`.
# NOTE(review): PyTorch accumulates into .grad, so re-running this cell without
# clearing features.grad adds to the previous result.
node_feature_importance = features.grad
# Per-node importance: squared gradient summed over the feature dimension.
node_importance = features.grad.pow(2).sum(dim=1)
node_importance = node_importance.tolist()

#for i in range(len(node_importance)):
#    if not node_importance[i] == 0:
#        pass
#        #print(i, node_importance[i])


In [None]:
# Rank the input features of node 645 by their (signed) gradient, largest first.
# The index 645 is hard-coded and matches N in the saliency cell above.
features_importance = features.grad[645]
features_importance = features_importance.tolist()
# Feature indices sorted by descending gradient value.
feature_index = sorted(range(len(features_importance)), key=lambda k: -features_importance[k])
for index in feature_index:
    print(index, features_importance[index])


In [None]:
# NOTE(review): this cell looks like a function body pasted from elsewhere.
# `net`, `batch`, `node_no` and a `graph_in` object with `zero_grad_()` are not
# defined in the visible notebook, and the trailing `return` is a syntax error
# at top level, so the cell cannot run as is.
graph_out = net(batch)[0]

# Select the output row of the node being explained.
N = node_no
node_relevance = torch.zeros_like(graph_out.node_features)
node_relevance[N] = 1

graph_in.zero_grad_()
graph_out.node_features.backward(node_relevance)

# Importance = squared gradient summed over the feature dimension.
node_importance = batch.node_features.grad.pow(2).sum(dim=1)
edge_importance = batch.edge_features.grad.pow(2).sum(dim=1)
return node_importance, edge_importance

In [None]:
model.state_dict()["gc1.weight"]

In [None]:
import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set()
#uniform_data = np.random.rand(10, 12)
# Draw the "gc2.weight" parameter of the loaded model as a heatmap and save it
# as a PNG in the current working directory.
ax = sns.heatmap(model.state_dict()["gc2.weight"].cpu().numpy())
fig = ax.get_figure()
fig.savefig("gc2_weight.png")

In [None]:
print(original_graph._indices()[0])

In [None]:
print(original_graph)

In [None]:
import networkx as nx
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt

# Build an undirected networkx graph from the index pairs of `original_graph`.
G = nx.Graph()
indices = original_graph._indices().tolist()
edge_num = len(indices[0])
# One (row, column) tuple per stored entry.
edge_index = [(indices[0][i], indices[1][i]) for i in range(edge_num)]
# Add every node id explicitly so that nodes without edges are kept as well.
node_index = list(range(features.shape[0]))
G.add_nodes_from(node_index)
G.add_edges_from(edge_index)


In [None]:

# Configure the ForceAtlas2 layout algorithm.
forceatlas2 = ForceAtlas2(
                        # Behavior alternatives
                        outboundAttractionDistribution=True,  # Dissuade hubs
                        linLogMode=False,  # NOT IMPLEMENTED
                        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                        edgeWeightInfluence=1.0,

                        # Performance
                        jitterTolerance=1.0,  # Tolerance
                        barnesHutOptimize=True,
                        barnesHutTheta=1.2,
                        multiThreaded=False,  # NOT IMPLEMENTED

                        # Tuning
                        scalingRatio=2.0,
                        strongGravityMode=False,
                        gravity=1.0,

                        # Log
                        verbose=True)

# Compute 2-D positions for the nodes of G (500 layout iterations).
positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=500)
# `with_labels` is not a parameter of draw_networkx_nodes; recent networkx
# releases raise a TypeError for it, and nodes are drawn without labels anyway.
nx.draw_networkx_nodes(G, positions, node_size=20, node_color="blue", alpha=0.4)
nx.draw_networkx_edges(G, positions, edge_color="green", alpha=0.05)
plt.axis('off')
plt.show()

# equivalently
#import igraph
#G = igraph.Graph.TupleList(G.edges(), directed=False)
#layout = forceatlas2.forceatlas2_igraph_layout(G, pos=None, iterations=2000)
#igraph.plot(G, layout).show()

In [None]:
print(positions)

In [None]:
# Flatten the layout dict into a list of [x, y] pairs ordered by node id.
# Iterate over the sorted keys themselves: the old loop sorted the keys but
# then looked positions up by the loop counter, which only matches when the
# node ids are exactly 0 .. n-1.
newlist = sorted(positions)
newPos = [[positions[node][0], positions[node][1]] for node in newlist]

In [None]:
len(newlist)

In [None]:
import pickle as pkl
# Pickle the flattened layout to ../data/<name>/<name>_layout.pkt
# (the directory must already exist).
with open("../data/{}/{}_layout.pkt".format(name,name),"wb") as f:
    pkl.dump(newPos, f)

In [None]:
import pickle as pkl
# Read the saved layout back and print it as a sanity check.
# Unpickling can execute arbitrary code; only load files created locally.
with open("../data/{}/{}_layout.pkt".format(name,name),"rb") as f:
    newPos = pkl.load(f)
    print(newPos)

In [None]:
import math
import torch
import torch.nn as nn 
from torch.nn.parameter import Parameter
import torch.nn.functional as F
from torch.nn.modules.module import Module
from torch.autograd import Variable
import numpy as np
import skcuda.linalg as sklin
from layer import GCN_layer
from utils import accuracy
'''
GCN_layer(ind,outd,bias=True)
'''

class GCN_hook(nn.Module):
    """Two-layer GCN that also exposes its hidden representation.

    Args:
        num_feature: input feature dimension.
        num_hidden: width of the hidden layer.
        num_class: number of output classes.
        dropout: dropout probability applied to the hidden activations.
        bias: forwarded to both ``GCN_layer`` instances (previously accepted
            but silently ignored).
    """

    def __init__(self, num_feature,num_hidden,num_class,dropout,bias=True):
        super(GCN_hook,self).__init__()

        # GCN_layer(ind, outd, bias=True): pass the caller's choice through.
        self.gc1 = GCN_layer(num_feature, num_hidden, bias=bias)
        self.gc2 = GCN_layer(num_hidden, num_class, bias=bias)
        self.dropout = dropout

    def forward(self, x, adj):
        """Return ``(log_softmax(output, dim=1), hidden)``.

        ``hidden`` is the ReLU output of the first layer after dropout;
        dropout is only active while the module is in training mode.
        """
        x = F.relu(self.gc1(x, adj))
        x1 = F.dropout(x, self.dropout, training=self.training)
        x2 = self.gc2(x1, adj)
        return F.log_softmax(x2, dim=1), x1

In [None]:
name2 = "photo"

In [None]:
# Load the saved weights for dataset `name2` into GCN_hook and evaluate on the
# test split; the hidden activations are kept for the t-SNE cells below.
model_path = "../models/gcn_{}_state.pkt".format(name2)
#torch.save(model.state_dict(), model_path)
# [num_feature, num_hidden, num_class, dropout]; the commented alternatives
# belong to other datasets.
#args_input = [3703,16,6,0.5]
#args_input = [500, 16, 3, 0.5]
#args_input = [2879, 16, 7, 0.5]
#args_input = [1222, 16, 2, 0.5]
args_input = [745, 16, 8, 0.5]
kwargs = {
    "bias": True
}
model = GCN_hook(*args_input,**kwargs)
# map_location='cpu' lets a checkpoint saved on GPU be loaded without CUDA.
model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Evaluation mode disables dropout, so inner_state is the plain ReLU output.
model.eval()
output, inner_state = model(features,L)
loss_test = F.nll_loss(output[idx_test],labels[idx_test])
acc_test = accuracy(output[idx_test],labels[idx_test])
print(loss_test.item(), acc_test.item())


In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import time
import pickle as pkl

def dimension_reduction(input_array):
    """Project ``input_array`` to two dimensions with t-SNE.

    The input is converted with ``np.array`` and embedded by
    ``TSNE(n_components=2).fit_transform``.  The elapsed wall-clock time in
    seconds is printed, and the embedding is returned.
    """
    began = time.time()
    samples = np.array(input_array)
    reducer = TSNE(n_components=2)
    embedding = reducer.fit_transform(samples)
    print(time.time() - began)
    return embedding
def visualize(embedded_array,labels):
    """Scatter-plot a 2-D embedding, coloured by label.

    ``embedded_array`` supplies the ``x``/``y`` columns and ``labels`` is
    appended as a third ``Labels`` column.  As a side effect the global
    seaborn palette is set to the two colours ``#FF0B04`` / ``#4374B3``;
    the scatter plot itself is drawn with the ``"Set1"`` palette and no
    legend.  Nothing is returned.
    """
    points = np.array(embedded_array)
    label_column = np.expand_dims(np.array(labels), axis=1)
    table = np.concatenate((points, label_column), axis=1)
    frame = pd.DataFrame(table, columns=["x", "y","Labels"])
    # Changes the process-wide default palette for later plots.
    sns.set_palette(sns.color_palette(["#FF0B04", "#4374B3"]))
    sns.scatterplot(x="x", y="y",hue="Labels", data=frame, palette="Set1", legend=False)
    
# Preview the currently active seaborn colour palette.
current_palette = sns.color_palette()
sns.palplot(current_palette)



In [None]:
## Input Layer
# t-SNE of the raw input features; the embedding is plotted and then pickled to
# ../data/<name>/<name>_tsne_input.pkt
# NOTE(review): the output path uses `name`, while the model was loaded with
# `name2` -- confirm both refer to the same dataset.
features_array = features.cpu().detach().numpy()
features_embeddded_array = dimension_reduction(features_array)
visualize(features_embeddded_array, labels.cpu().detach().numpy())
with open("../data/{}/{}_tsne_input.pkt".format(name,name),"wb") as f:
    pkl.dump(features_embeddded_array, f)
    print("Done!")
    

In [None]:
## Hidden Layer
# t-SNE of the hidden activations returned by GCN_hook (`inner_state`); the
# embedding is plotted and then pickled to ../data/<name>/<name>_tsne_hidden.pkt
# `labels_array` is computed but not used in this cell.
labels_array = labels.cpu().detach().numpy()
layer2 = inner_state
layer2_array = layer2.cpu().detach().numpy()
layer2_embeddded_array = dimension_reduction(layer2_array)
visualize(layer2_embeddded_array, labels.cpu().detach().numpy())
with open("../data/{}/{}_tsne_hidden.pkt".format(name,name),"wb") as f:
    pkl.dump(layer2_embeddded_array, f)
    print("Done!")

In [None]:
## Output Layer
# t-SNE of the model output (log-softmax scores); the embedding is plotted and
# then pickled to ../data/<name>/<name>_tsne_output.pkt
# `labels_array` is computed but not used in this cell.
input_array = output.cpu().detach().numpy()
labels_array = labels.cpu().detach().numpy()
embedded_array = dimension_reduction(input_array)
visualize(embedded_array, labels.cpu().detach().numpy())
with open("../data/{}/{}_tsne_output.pkt".format(name,name),"wb") as f:
    pkl.dump(embedded_array, f)
    print("Done!")