In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

In [3]:
from core.SimpleNet import SimpleNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import graph_trustworthiness, trustworthiness, evaluate_net_metrics
from util.network_utils import get_net_projection
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling

In [4]:
from bokeh.io import output_notebook
output_notebook()

In [5]:
if torch.cuda.is_available():
    print('cuda available')
    device = 'cuda'
else:
    print('cuda not available')
    device = 'cpu'

cuda not available


In [6]:
dataset_name = 'reddit_full'
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=50, shuffle=False)
dataset.summarise()

Data blocks of length:  [474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 473]
Time to create all data (s) = 0.2812
Name of dataset = reddit_full
Input dimension = 602
Number of training samples = 23699
Training labels = True


In [7]:
net_parameters = {}
net_parameters['n_components'] = 2
net_parameters['D'] = dataset.input_dim # input dimension
net_parameters['H'] = 256 # number of hidden units
net_parameters['L'] = 2 # number of hidden layers

In [104]:
n_plot = 70
D_layers = [-1, 0]
mask = np.random.choice(dataset.all_indices, size=n_plot, replace=False)
mask = neighbor_sampling(dataset.adj_matrix, mask, D_layers)
inputs = dataset.inputs[mask]
labels = dataset.labels[mask]
W = dataset.adj_matrix[mask, :][:, mask]
print(len(mask))
G = GraphDataBlock(inputs, labels, W)

1053


In [105]:
# Graph net
net_parameters['H'] = 256 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/reddit_full_1/'
filename = root + 'graph_net_2.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [106]:
y_emb_1 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_1, labels, W, line_alpha=0.1)

In [107]:
# Graph net
net_parameters['H'] = 256 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/reddit_full_2/'
filename = root + 'graph_net_2.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [108]:
y_emb_2 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_2, labels, W, line_alpha=0.1)

In [109]:
# Graph net
net_parameters['H'] = 256 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/reddit_full_3/'
filename = root + 'graph_net_1.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [110]:
y_emb_3 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_3, labels, W, line_alpha=0.1)

In [111]:
def combined_metric(y_emb, feature_matrix, W, k=5):
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics.pairwise import pairwise_distances
    
    scaler = StandardScaler()
    z_emb = scaler.fit_transform(y_emb)
    z_dist_matrix = pairwise_distances(z_emb, squared=True)
    
    #feature_dist_matrix = pairwise_distances(feature_matrix, metric='cosine')
    knn_graph = kneighbors_graph(feature_matrix, n_neighbors=k, mode='connectivity', metric='cosine', include_self=False)
    
    loss_1 = np.sum(W.toarray() * z_dist_matrix) / W.getnnz()
    loss_2 = np.sum(knn_graph.toarray() * z_dist_matrix) / knn_graph.getnnz()
    loss = loss_1 + loss_2
    
    print('Graph distances: ', loss_1)
    print('Feature distances: ', loss_2)
    print('Total loss: ', loss)
    
    return loss

In [112]:
_ = combined_metric(y_emb_1, inputs, W, k=5)

Graph distances:  2.137174358257829
Feature distances:  0.28487629093572936
Total loss:  2.4220506491935585


In [113]:
_ = combined_metric(y_emb_2, inputs, W, k=5)

Graph distances:  2.3571823667200964
Feature distances:  0.3278102114990332
Total loss:  2.6849925782191297


In [114]:
_ = combined_metric(y_emb_3, inputs, W, k=5)

Graph distances:  2.7721803047513562
Feature distances:  0.41800464639495016
Total loss:  3.1901849511463065
