In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
from core.SimpleNet import SimpleNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import evaluate_viz_metrics
from util.network_utils import get_net_projection, _get_net_projection
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling

cuda not available
cuda not available


In [4]:
from bokeh.io import output_notebook
output_notebook()

In [5]:
if torch.cuda.is_available():
    print('cuda available')
    device = 'cuda'
else:
    print('cuda not available')
    device = 'cpu'

cuda not available


In [6]:
dataset_name = 'pubmed_full'
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)
dataset.summarise()

Data blocks of length:  [19717]
Time to create all data (s) = 0.1006
Name of dataset = pubmed_full
Input dimension = 500
Number of training samples = 19717
Training labels = True


In [7]:
net_parameters = {}
net_parameters['n_components'] = 2
net_parameters['D'] = dataset.input_dim # input dimension
net_parameters['H'] = 50 # number of hidden units
net_parameters['L'] = 2 # number of hidden layers

In [8]:
n_plot = 50
D_layers = [9, 14]
mask = sorted(np.random.choice(dataset.all_indices, size=n_plot, replace=False))
mask = neighbor_sampling(dataset.adj_matrix, mask, D_layers)
inputs = dataset.inputs[mask]
labels = dataset.labels[mask]
W = dataset.adj_matrix[mask, :][:, mask]
print(len(mask))
G = GraphDataBlock(inputs, labels, W)
G.add_original_indices(mask)

1312


In [9]:
# Graph net
net_parameters['H'] = 512 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/pubmed_full_9/'
filename = root + 'graph_net_2.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [10]:
y_emb_1 = _get_net_projection(net_1, G, sampling=True, dataset=dataset)
plot_graph_embedding(y_emb_1, labels, W, line_alpha=0.1)

In [11]:
# Graph net
net_parameters['H'] = 512 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/pubmed_full_10/'
filename = root + 'graph_net_2.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [12]:
y_emb_2 = _get_net_projection(net_1, G, sampling=True, dataset=dataset)
plot_graph_embedding(y_emb_2, labels, W, line_alpha=0.1)

In [13]:
# Graph net
net_parameters['H'] = 512 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/pubmed_full_11/'
filename = root + 'graph_net_2.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [14]:
y_emb_3 = _get_net_projection(net_1, G, sampling=True, dataset=dataset)
plot_graph_embedding(y_emb_3, labels, W, line_alpha=0.1)

In [15]:
_ = evaluate_viz_metrics(y_emb_1, G)

One NN accuracy = 0.7706
Avg graph distance = 0.1761
Avg feature distance = 0.7007
Total distance = 0.8768


In [16]:
_ = evaluate_viz_metrics(y_emb_2, G)

One NN accuracy = 0.7325
Avg graph distance = 0.6842
Avg feature distance = 0.3518
Total distance = 1.0360


In [17]:
_ = evaluate_viz_metrics(y_emb_3, G)

One NN accuracy = 0.7759
Avg graph distance = 0.1422
Avg feature distance = 0.8082
Total distance = 0.9504


## Archive

In [18]:
assert 1==2

AssertionError: 

In [None]:
# Simple net
net_2 = SimpleNet(net_parameters)
if torch.cuda.is_available(): net_2.cuda()
root = 'results/pubmed_4/'
filename = root + 'simple_net_5.pkl'
checkpoint = torch.load(filename, map_location=device)
net_2.load_state_dict(checkpoint['state_dict'])

In [None]:
y_pred_2 = get_net_projection(dataset.all_data, net_2)

In [None]:
y_emb_2 = get_net_projection([G], net_2)
plot_graph_embedding(y_emb_2, labels, W, line_alpha=0.1)

In [None]:
path_matrix = get_shortest_path_matrix(W.toarray())
path_matrix.shape

In [None]:
embedder = TSNE(n_components=2, metric='precomputed', perplexity=30, verbose=1)
y_pred_tsne = embedder.fit_transform(path_matrix)

In [None]:
plot_graph_embedding(y_pred_tsne, labels, W, line_alpha=0.1)

In [None]:
# embeddings_dict = {r'tsne ($\alpha=0)$': y_pred_tsne,
#                    r'tsne ($\alpha=1)$': y_pred_tsne_2}

In [None]:
embeddings_dict = {r'graph net ($\alpha=0.5$)': y_emb,
                   r'simple net ($\alpha=0.5$)': y_emb_2,
                  'tsne': y_pred_tsne}

In [None]:
def neighborhood_preservation(path_matrix, X_emb, max_graph_dist=2):
    dist_X_emb = pairwise_distances(X_emb, squared=True)
    ind_X_emb = np.argsort(dist_X_emb, axis=1)[:, 1:]

    n_samples = X_emb.shape[0]
    t = 0.0
    for i in range(n_samples):
        graph_n = {k for k, v in enumerate(path_matrix[i]) if 0 < v <= max_graph_dist}
        if len(graph_n) == 0:
            t += 1
            continue
        layout_n = set(ind_X_emb[i][:len(graph_n)])
        intersection_size = len(graph_n.intersection(layout_n))
        # print(len(graph_n), intersection_size)
        t += intersection_size / (2*len(graph_n) - intersection_size)
    return t/n_samples

In [None]:
test_range = list(range(1, 5))
scores = np.zeros(len(test_range))

for label, embedding in embeddings_dict.items():
    for i, k in enumerate(test_range):
        scores[i] = neighborhood_preservation(path_matrix, embedding, max_graph_dist=k)
    plt.plot(test_range, scores, label=label)
        
plt.legend()
plt.xlabel('max_graph_dist')
plt.grid(True)
plt.title('Graph preservation')

In [None]:
test_range = list(range(1, 11))
scores = np.zeros(len(test_range))

D = pairwise_distances(inputs, metric='cosine')

for label, embedding in embeddings_dict.items():
    for i, k in enumerate(test_range):
        scores[i] = trustworthiness(D, embedding, precomputed=True, n_neighbors=k)
    plt.plot(test_range, scores, label=label)
        
plt.legend()
plt.xlabel('nearest neighbors (k)')
plt.grid(True)
plt.title('Feature preservation')