In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
from core.SimpleNet import SimpleNet
from core.OldGraphConvNet import OldGraphConvNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import graph_trustworthiness, trustworthiness
from util.network_utils import get_net_projection
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling

In [4]:
from bokeh.io import output_notebook
output_notebook()

In [5]:
if torch.cuda.is_available():
    dtypeFloat = torch.cuda.FloatTensor
    dtypeLong = torch.cuda.LongTensor
    print('cuda available')
    device = 'cuda'
else:
    dtypeFloat = torch.FloatTensor
    dtypeLong = torch.LongTensor
    print('cuda not available')
    device = 'cpu'

cuda not available


In [6]:
dataset_name = 'cora_full'
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)
dataset.summarise()

Time to create all data (s) = 0.0346
Name of dataset = cora_full
Input dimension = 1433
Number of training samples = 2708
Training labels = True


In [7]:
net_parameters = {}
net_parameters['n_components'] = 2
net_parameters['D'] = dataset.input_dim # input dimension
net_parameters['L'] = 2 # number of hidden layers

In [58]:
# Graph net
net_parameters['H'] = 64 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/cora_full_1/'
filename = root + 'graph_net_800.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [9]:
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)

Time to create all data (s) = 0.0132


In [59]:
y_pred_1 = get_net_projection(dataset.all_data, net_1)

In [60]:
plot_graph_embedding(y_pred_1, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [12]:
# Simple net
net_2 = SimpleNet(net_parameters)
if torch.cuda.is_available(): net_2.cuda()
root = 'results/cora_full_3/'
filename = root + 'simple_net_3000.pkl'
checkpoint = torch.load(filename, map_location=device)
net_2.load_state_dict(checkpoint['state_dict'])

In [13]:
y_pred_2 = get_net_projection(dataset.all_data, net_2)

In [14]:
plot_graph_embedding(y_pred_2, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [15]:
def apply_mask(inputs, labels, adj, mask):
    return inputs[mask], labels[mask], adj[mask,:][:,mask]

In [16]:
all_indices = np.arange(0, len(dataset.labels))
np.random.shuffle(all_indices)
n = len(dataset.labels)
n_train = int(n * 0.6)
train_mask = all_indices[:n_train]
test_mask = all_indices[n_train:]
print(len(train_mask), len(test_mask))

1624 1084


In [17]:
from util.evaluation_metrics import run_regression

In [61]:
train_embeds = y_pred_1[train_mask]
train_labels = dataset.labels[train_mask]
test_embeds = y_pred_1[test_mask]
test_labels = dataset.labels[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.5940959409594095
Random baseline f1 score: 0.17435424354243542




In [19]:
train_embeds = y_pred_2[train_mask]
test_embeds = y_pred_2[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)



F1 score: 0.6116236162361623
Random baseline f1 score: 0.17435424354243542


In [20]:
from util.network_utils import get_net_embeddings

In [62]:
y_pred_3 = get_net_embeddings(dataset.all_data, net_1, net_type='graph', H=64)

In [63]:
y_pred_3.shape

torch.Size([2708, 64])

In [64]:
train_embeds = y_pred_3[train_mask]
test_embeds = y_pred_3[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6743542435424354
Random baseline f1 score: 0.17435424354243542




In [65]:
embedder = TSNE(n_components=2, method='exact', perplexity=30, verbose=1)
y_half_graph_tsne = embedder.fit_transform(y_pred_3)
plot_graph_embedding(y_half_graph_tsne, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 2708
[t-SNE] Computed conditional probabilities for sample 2000 / 2708
[t-SNE] Computed conditional probabilities for sample 2708 / 2708
[t-SNE] Mean sigma: 1.692606
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.670061
[t-SNE] KL divergence after 1000 iterations: 0.872550


In [57]:
y_pred_4 = get_net_embeddings(dataset.all_data, net_2, net_type='simple', H=2000)

In [25]:
train_embeds = y_pred_4[train_mask]
test_embeds = y_pred_4[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.4123616236162362
Random baseline f1 score: 0.17435424354243542




In [26]:
raw_features = dataset.inputs.toarray()
raw_features.shape

(2708, 1433)

In [27]:
train_embeds = raw_features[train_mask]
test_embeds = raw_features[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)



F1 score: 0.7121771217712177
Random baseline f1 score: 0.17435424354243542


In [28]:
path_matrix = get_shortest_path_matrix(dataset.adj_matrix.toarray())
path_matrix.shape

Computing all pairs shortest path lengths for 2708 nodes...
Time to compute shortest paths (s) = 24.5688


(2708, 2708)

In [29]:
embedder = TSNE(n_components=64, metric='precomputed', method='exact', perplexity=30, verbose=1)
y_tsne = embedder.fit_transform(path_matrix)

[t-SNE] Computed conditional probabilities for sample 1000 / 2708
[t-SNE] Computed conditional probabilities for sample 2000 / 2708
[t-SNE] Computed conditional probabilities for sample 2708 / 2708
[t-SNE] Mean sigma: 0.605159
[t-SNE] KL divergence after 250 iterations with early exaggeration: 87.762028
[t-SNE] KL divergence after 650 iterations: 6.541208


In [30]:
train_embeds = y_tsne[train_mask]
test_embeds = y_tsne[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.5479704797047971
Random baseline f1 score: 0.17435424354243542




In [66]:
y_combined = np.hstack((raw_features, y_pred_3))
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.7610701107011071
Random baseline f1 score: 0.17435424354243542




In [67]:
y_combined = np.hstack((raw_features, y_tsne))
y_combined.shape

(2708, 1435)

In [68]:
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.507380073800738
Random baseline f1 score: 0.17435424354243542




In [33]:
with open(input_dir+'/cora_full/cora_deepwalk.pkl', 'rb') as f:
    y_deepwalk = pickle.load(f)

In [34]:
y_deepwalk.shape

(2708, 64)

In [35]:
train_embeds = y_deepwalk[train_mask]
test_embeds = y_deepwalk[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)



F1 score: 0.7204797047970478
Random baseline f1 score: 0.17435424354243542


In [36]:
y_combined = np.hstack((raw_features, y_deepwalk))
y_combined.shape

(2708, 1497)

In [37]:
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)



F1 score: 0.8035055350553506
Random baseline f1 score: 0.17435424354243542


In [38]:
# Graph net
net_parameters['H'] = 128 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/cora_full_4/'
filename = root + 'graph_net_800.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [39]:
y_pred_1 = get_net_projection(dataset.all_data, net_1)
plot_graph_embedding(y_pred_1, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [40]:
y_pred_5 = get_net_embeddings(dataset.all_data, net_1, net_type='graph', H=128)

In [41]:
train_embeds = y_pred_5[train_mask]
test_embeds = y_pred_5[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.4621771217712177
Random baseline f1 score: 0.17435424354243542




In [54]:
embedder = TSNE(n_components=2, method='exact', perplexity=30, verbose=1)
y_pure_graph_tsne = embedder.fit_transform(y_pred_5)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 2708
[t-SNE] Computed conditional probabilities for sample 2000 / 2708
[t-SNE] Computed conditional probabilities for sample 2708 / 2708
[t-SNE] Mean sigma: 2.045090
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.719667
[t-SNE] KL divergence after 1000 iterations: 0.764496


In [55]:
plot_graph_embedding(y_pure_graph_tsne, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [42]:
# Simple net
net_2 = SimpleNet(net_parameters)
if torch.cuda.is_available(): net_2.cuda()
root = 'results/cora_full_5/'
filename = root + 'simple_net_3000.pkl'
checkpoint = torch.load(filename, map_location=device)
net_2.load_state_dict(checkpoint['state_dict'])

In [43]:
y_pred_2 = get_net_projection(dataset.all_data, net_2)
plot_graph_embedding(y_pred_2, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [44]:
y_combined_2 = np.hstack((raw_features, y_pred_5))
y_combined_2.shape

(2708, 1561)

In [45]:
train_embeds = y_combined_2[train_mask]
test_embeds = y_combined_2[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6217712177121771
Random baseline f1 score: 0.17435424354243542




In [46]:
from scipy.sparse.csgraph import connected_components
n_connected, _ = connected_components(dataset.adj_matrix)
n_connected

78

In [47]:
embedder = TSNE(n_components=2, method='exact', perplexity=30, verbose=1)
y_deepwalk_tsne = embedder.fit_transform(y_deepwalk)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 2708
[t-SNE] Computed conditional probabilities for sample 2000 / 2708
[t-SNE] Computed conditional probabilities for sample 2708 / 2708
[t-SNE] Mean sigma: 1.725809
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.649081
[t-SNE] KL divergence after 1000 iterations: 0.682444


In [48]:
plot_graph_embedding(y_deepwalk_tsne, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [49]:
embedder = TSNE(n_components=2, metric='precomputed', method='exact', perplexity=30, verbose=1)
y_tsne = embedder.fit_transform(path_matrix)
plot_graph_embedding(y_tsne, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

[t-SNE] Computed conditional probabilities for sample 1000 / 2708
[t-SNE] Computed conditional probabilities for sample 2000 / 2708
[t-SNE] Computed conditional probabilities for sample 2708 / 2708
[t-SNE] Mean sigma: 0.605159
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.570879
[t-SNE] KL divergence after 1000 iterations: 1.202128


In [69]:
# Graph net
net_parameters['H'] = 64 # number of hidden units
net_parameters['n_components'] = 64

net_3 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_3.cuda()
root = 'results/cora_full_8/'
filename = root + 'graph_net_1200.pkl'
checkpoint = torch.load(filename, map_location=device)
net_3.load_state_dict(checkpoint['state_dict'])

In [70]:
y_pred_6 = get_net_projection(dataset.all_data, net_3)

In [71]:
train_embeds = y_pred_6[train_mask]
test_embeds = y_pred_6[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.7158671586715867
Random baseline f1 score: 0.17435424354243542




In [72]:
y_combined = np.hstack((raw_features, y_pred_6))
y_combined.shape

(2708, 1497)

In [73]:
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.7767527675276754
Random baseline f1 score: 0.17435424354243542




In [99]:
# Graph net
net_parameters['H'] = 128 # number of hidden units
net_parameters['n_components'] = 128

net_4 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_4.cuda()
root = 'results/cora_full_9/'
filename = root + 'graph_net_240.pkl'
checkpoint = torch.load(filename, map_location=device)
net_4.load_state_dict(checkpoint['state_dict'])

y_pred_7 = get_net_projection(dataset.all_data, net_4)

In [100]:
train_embeds = y_pred_7[train_mask]
test_embeds = y_pred_7[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8154981549815498
Random baseline f1 score: 0.17435424354243542




In [105]:
# Graph net
net_parameters['H'] = 128 # number of hidden units
net_parameters['n_components'] = 128

net_5 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_5.cuda()
root = 'results/cora_full_10/'
filename = root + 'graph_net_160.pkl'
checkpoint = torch.load(filename, map_location=device)
net_5.load_state_dict(checkpoint['state_dict'])

y_pred_8 = get_net_projection(dataset.all_data, net_5)

In [107]:
train_embeds = y_pred_8[train_mask]
test_embeds = y_pred_8[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8081180811808119
Random baseline f1 score: 0.17435424354243542




# PCA of embeddings
Compare DeepWalk, graph(0.5) and graph(1.0)

In [50]:
from sklearn import decomposition
embedder = decomposition.TruncatedSVD(n_components=2)

In [51]:
pca_deepwalk = embedder.fit_transform(y_deepwalk)
plot_graph_embedding(pca_deepwalk, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [52]:
pca_half_graph = embedder.fit_transform(y_pred_3)
plot_graph_embedding(pca_half_graph, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

In [53]:
pca_pure_graph = embedder.fit_transform(y_pred_5)
plot_graph_embedding(pca_pure_graph, dataset.labels, dataset.adj_matrix, line_alpha=0.1)