In [1]:
import os

# Make the repository root the working directory so the `core` and `util`
# packages imported further down resolve relative to it.
# The guard keeps this cell idempotent: an unconditional os.chdir('..') climbs
# one more level every time the cell is re-run in the same kernel.
# NOTE(review): assumes `core/` sits directly in the repository root -- confirm.
if not os.path.isdir('core'):
    os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

In [3]:
from core.SimpleNet import SimpleNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import graph_trustworthiness, trustworthiness, evaluate_net_metrics
from util.network_utils import get_net_projection
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling

cuda not available
cuda not available


In [4]:
# Configure Bokeh to render its output inline in notebook cells.
from bokeh.io import output_notebook
output_notebook()

In [5]:
# Choose the torch device once; later cells pass it to torch.load(map_location=...).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('cuda available' if device == 'cuda' else 'cuda not available')

cuda not available


In [6]:
# Load the training split of the 'reddit_full' dataset.
dataset_name = 'reddit_full'
# '..' is resolved against the current working directory, so the data folder
# is expected one level above it.
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
# Two data blocks, no shuffling (the recorded run reports block lengths 11850 and 11849).
dataset.create_all_data(n_batches=2, shuffle=False)
dataset.summarise()

Data blocks of length:  [11850, 11849]
Time to create all data (s) = 0.2879
Name of dataset = reddit_full
Input dimension = 602
Number of training samples = 23699
Training labels = True


In [7]:
# Hyper-parameters handed to the network constructors in later cells.
net_parameters = {
    'n_components': 2,           # dimensionality of the output embedding
    'D': dataset.input_dim,      # input dimension
    'H': 256,                    # number of hidden units
    'L': 2,                      # number of hidden layers
}

In [8]:
# Load the DeepWalk embedding file stored in the dataset's own folder.
# The recorded run reports a (23699, 256) matrix, matching the 23699 training samples.
from util.io_utils import unpack_deepwalk_embedding
data_dir = os.path.join(input_dir, dataset_name)
y_deepwalk_256 = unpack_deepwalk_embedding(os.path.join(data_dir, 'reddit_256.embeddings'))

Embedding matrix shape:  (23699, 256)


In [9]:
# Build a subgraph for visualisation: draw n_plot random node indices, pass them
# through neighbor_sampling, then slice features, labels and adjacency by the result.
# The recorded run ends up with 944 nodes.
# NOTE(review): np.random is never seeded, so every run samples a different subgraph.
n_plot = 70
# NOTE(review): meaning of the D_layers values is defined by neighbor_sampling
# in util.graph_utils -- confirm there.
D_layers = [-1, 0]
mask = np.random.choice(dataset.all_indices, size=n_plot, replace=False)
mask = neighbor_sampling(dataset.adj_matrix, mask, D_layers)
inputs = dataset.inputs[mask]
labels = dataset.labels[mask]
# Keep only rows and columns of the adjacency that belong to sampled nodes.
W = dataset.adj_matrix[mask, :][:, mask]
print(len(mask))
G = GraphDataBlock(inputs, labels, W)

944


In [10]:
# Graph net restored from results/reddit_full_1/graph_net_2.pkl
net_parameters['H'] = 256  # number of hidden units

net_1 = GraphConvNet(net_parameters)
# `device` is 'cuda' exactly when torch.cuda.is_available(), so this matches
# the conditional .cuda() call.
net_1.to(device)
root = 'results/reddit_full_1/'
filename = os.path.join(root, 'graph_net_2.pkl')
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [11]:
# Project the sampled subgraph G with the network loaded from results/reddit_full_1
# and plot the result together with the labels and adjacency W.
y_emb_1 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_1, labels, W, line_alpha=0.1)

In [12]:
# Graph net restored from results/reddit_full_2/graph_net_2.pkl
# (rebinds net_1; the previous model is discarded)
net_parameters['H'] = 256  # number of hidden units

net_1 = GraphConvNet(net_parameters)
# `device` is 'cuda' exactly when torch.cuda.is_available(), so this matches
# the conditional .cuda() call.
net_1.to(device)
root = 'results/reddit_full_2/'
filename = os.path.join(root, 'graph_net_2.pkl')
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [13]:
# Project the same subgraph G with the network loaded from results/reddit_full_2
# and plot the result together with the labels and adjacency W.
y_emb_2 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_2, labels, W, line_alpha=0.1)

In [14]:
# Graph net restored from results/reddit_full_3/graph_net_1.pkl
# (rebinds net_1; the previous model is discarded)
net_parameters['H'] = 256  # number of hidden units

net_1 = GraphConvNet(net_parameters)
# `device` is 'cuda' exactly when torch.cuda.is_available(), so this matches
# the conditional .cuda() call.
net_1.to(device)
root = 'results/reddit_full_3/'
filename = os.path.join(root, 'graph_net_1.pkl')
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [15]:
# Project the same subgraph G with the network loaded from results/reddit_full_3
# and plot the result together with the labels and adjacency W.
y_emb_3 = get_net_projection([G], net_1)
plot_graph_embedding(y_emb_3, labels, W, line_alpha=0.1)

In [16]:
# Baseline: take the DeepWalk vectors of the sampled nodes and reduce them to
# two components with exact t-SNE.
# NOTE(review): no random_state is passed, so the layout changes between runs.
dw_embed = y_deepwalk_256[mask]
from sklearn.manifold import TSNE
embedder = TSNE(n_components=2, method="exact", perplexity=30, verbose=1)
y_pred_deepwalk = embedder.fit_transform(dw_embed)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 944 / 944
[t-SNE] Mean sigma: 1.586169
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.633468
[t-SNE] KL divergence after 1000 iterations: 0.724327


In [17]:
# Plot the t-SNE projection of the DeepWalk vectors with the same labels and adjacency.
plot_graph_embedding(y_pred_deepwalk, labels, W, line_alpha=0.1)

In [24]:
# All-pairs shortest-path lengths on the sampled subgraph, computed from the
# densified adjacency (944 x 944 in the recorded run, about 2.7 s).
path_matrix = get_shortest_path_matrix(W.toarray())
path_matrix.shape

Computing all pairs shortest path lengths for 944 nodes...
Time to compute shortest paths (s) = 2.6759


(944, 944)

In [25]:
# Baseline: exact t-SNE using the shortest-path matrix as precomputed distances.
# NOTE(review): no random_state is passed, so the layout changes between runs.
from sklearn.manifold import TSNE
embedder = TSNE(n_components=2, metric='precomputed', method="exact", perplexity=30, verbose=1)
y_pred_tsne = embedder.fit_transform(path_matrix)

[t-SNE] Computed conditional probabilities for sample 944 / 944
[t-SNE] Mean sigma: 0.622104
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.568081
[t-SNE] KL divergence after 1000 iterations: 0.756712


In [26]:
# Plot the shortest-path t-SNE embedding with the same labels and adjacency.
plot_graph_embedding(y_pred_tsne, labels, W, line_alpha=0.1)

In [18]:
def combined_metric(y_emb, feature_matrix, W, k=5):
    """Score an embedding against graph edges and feature-space neighbours.

    The embedding is standardised per dimension, squared Euclidean distances
    between all embedded points are computed, and two averages are taken:

    * graph term: sum of ``W[i, j] * dist[i, j]`` divided by the number of
      stored entries of ``W``;
    * feature term: the same average over the ``k``-nearest-neighbour
      connectivity graph built from ``feature_matrix`` with cosine distance
      (self-loops excluded).

    Both terms and their sum are printed.

    Parameters
    ----------
    y_emb : array-like of shape (n_samples, n_components)
        Low-dimensional embedding to evaluate.
    feature_matrix : array-like of shape (n_samples, n_features)
        Original node features used to build the k-NN graph.
    W : scipy sparse matrix or array-like of shape (n_samples, n_samples)
        Graph adjacency. A dense input is converted to CSR first.
    k : int, default=5
        Number of feature-space neighbours per node.

    Returns
    -------
    float
        Graph term plus feature term. A term is NaN when its graph has no
        stored entries.
    """
    from scipy import sparse
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics.pairwise import pairwise_distances

    def _mean_edge_distance(graph, dist):
        # Weighted distance sum over stored entries, divided by their count.
        n_edges = graph.getnnz()
        if n_edges == 0:
            # Same value the unguarded division produced, minus the RuntimeWarning.
            return float('nan')
        # Element-wise sparse * dense only touches stored entries, so the
        # n x n graph is never densified.
        return graph.multiply(dist).sum() / n_edges

    # Accept dense adjacency too; sparse inputs are used as-is so their
    # stored-entry count is unchanged.
    if not sparse.issparse(W):
        W = sparse.csr_matrix(W)

    z_emb = StandardScaler().fit_transform(y_emb)
    z_dist_matrix = pairwise_distances(z_emb, squared=True)

    knn_graph = kneighbors_graph(feature_matrix, n_neighbors=k, mode='connectivity',
                                 metric='cosine', include_self=False)

    loss_1 = _mean_edge_distance(W, z_dist_matrix)
    loss_2 = _mean_edge_distance(knn_graph, z_dist_matrix)
    loss = loss_1 + loss_2

    print('Graph distances: ', loss_1)
    print('Feature distances: ', loss_2)
    print('Total loss: ', loss)

    return loss

In [19]:
# Score the embedding produced by the results/reddit_full_1 checkpoint.
_ = combined_metric(y_emb_1, inputs, W, k=5)

Graph distances:  2.630519193295597
Feature distances:  0.36731389742403814
Total loss:  2.997833090719635


In [20]:
# Score the embedding produced by the results/reddit_full_2 checkpoint.
_ = combined_metric(y_emb_2, inputs, W, k=5)

Graph distances:  2.6719945181038662
Feature distances:  0.3555289058336372
Total loss:  3.0275234239375033


In [21]:
# Score the embedding produced by the results/reddit_full_3 checkpoint.
_ = combined_metric(y_emb_3, inputs, W, k=5)

Graph distances:  3.2233990579666836
Feature distances:  0.5177726871728644
Total loss:  3.741171745139548


In [22]:
# Score the DeepWalk + t-SNE baseline with the same metric.
_ = combined_metric(y_pred_deepwalk, inputs, W, k=5)

Graph distances:  0.18388675019721396
Feature distances:  2.6585565803569575
Total loss:  2.8424433305541714


In [27]:
# Peek at the top-left 10 x 10 corner of the shortest-path matrix.
path_matrix[:10, :10]

array([[0., 6., 4., 5., 5., 6., 6., 2., 6., 5.],
       [6., 0., 8., 6., 5., 7., 7., 5., 7., 8.],
       [4., 8., 0., 8., 6., 8., 8., 3., 8., 8.],
       [5., 6., 8., 0., 4., 5., 5., 5., 6., 5.],
       [5., 5., 6., 4., 0., 6., 5., 3., 7., 6.],
       [6., 7., 8., 5., 6., 0., 4., 5., 2., 6.],
       [6., 7., 8., 5., 5., 4., 0., 5., 5., 6.],
       [2., 5., 3., 5., 3., 5., 5., 0., 5., 5.],
       [6., 7., 8., 6., 7., 2., 5., 5., 0., 7.],
       [5., 8., 8., 5., 6., 6., 6., 5., 7., 0.]])

In [28]:
# Scratch arithmetic; the result (92) is the n_plot value used in the next sampling cell.
# NOTE(review): 23000 presumably approximates the 23699 training samples -- confirm.
23000/250

92.0

In [30]:
# Re-sample a larger subgraph with different neighbor_sampling settings;
# this overwrites mask, inputs, labels and W from the earlier sampling cell.
# The recorded run yields 4985 nodes.
# NOTE(review): np.random is never seeded, so every run samples a different subgraph.
n_plot = 92
# NOTE(review): meaning of the D_layers values is defined by neighbor_sampling
# in util.graph_utils -- confirm there.
D_layers = [9, 14]
mask = np.random.choice(dataset.all_indices, size=n_plot, replace=False)
mask = neighbor_sampling(dataset.adj_matrix, mask, D_layers)
inputs = dataset.inputs[mask]
labels = dataset.labels[mask]
# Keep only rows and columns of the adjacency that belong to sampled nodes.
W = dataset.adj_matrix[mask, :][:, mask]
len(mask)

4985

In [31]:
# All-pairs shortest-path lengths for the larger subgraph.
# Expensive: the recorded run took about 199 s for 4985 nodes.
path_matrix = get_shortest_path_matrix(W.toarray())
path_matrix.shape

Computing all pairs shortest path lengths for 4985 nodes...
Time to compute shortest paths (s) = 198.6134


(4985, 4985)

In [32]:
# Scratch arithmetic; the result (11.5) is rounded up to n_plot = 12 in the next sampling cell.
# NOTE(review): 23000 presumably approximates the 23699 training samples -- confirm.
23000/2000

11.5

In [45]:
# Re-sample a smaller subgraph with the same D_layers as the previous sampling cell;
# this overwrites mask, inputs, labels and W again. The recorded run yields 716 nodes.
# NOTE(review): np.random is never seeded, so every run samples a different subgraph.
n_plot = 12
# NOTE(review): meaning of the D_layers values is defined by neighbor_sampling
# in util.graph_utils -- confirm there.
D_layers = [9, 14]
mask = np.random.choice(dataset.all_indices, size=n_plot, replace=False)
mask = neighbor_sampling(dataset.adj_matrix, mask, D_layers)
inputs = dataset.inputs[mask]
labels = dataset.labels[mask]
# Keep only rows and columns of the adjacency that belong to sampled nodes.
W = dataset.adj_matrix[mask, :][:, mask]
len(mask)

716

In [46]:
# All-pairs shortest-path lengths for the smaller subgraph
# (716 nodes, about 2.1 s in the recorded run).
path_matrix = get_shortest_path_matrix(W.toarray())
path_matrix.shape

Computing all pairs shortest path lengths for 716 nodes...
Time to compute shortest paths (s) = 2.1024


(716, 716)

In [42]:
# Display the shortest-path matrix.
# NOTE(review): this cell's execution count (42) is lower than the two cells
# above it (45, 46), so the saved output predates the current path_matrix.
path_matrix

array([[0, 3, 4, ..., 4, 4, 3],
       [3, 0, 3, ..., 3, 3, 2],
       [4, 3, 0, ..., 3, 2, 2],
       ...,
       [4, 3, 3, ..., 0, 4, 1],
       [4, 3, 2, ..., 4, 0, 3],
       [3, 2, 2, ..., 1, 3, 0]])