In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
from core.SimpleNet import SimpleNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import graph_trustworthiness, trustworthiness, run_regression
from util.network_utils import get_net_projection, get_net_embeddings
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling

In [4]:
from bokeh.io import output_notebook
output_notebook()

In [5]:
if torch.cuda.is_available():
    print('cuda available')
    device = 'cuda'
else:
    print('cuda not available')
    device = 'cpu'

cuda not available


In [6]:
dataset_name = 'citeseer_full'
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)
dataset.summarise()

Time to create all data (s) = 0.1682
Name of dataset = citeseer_full
Input dimension = 3703
Number of training samples = 3312
Training labels = True


In [7]:
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)

Time to create all data (s) = 0.0342


In [8]:
all_indices = np.arange(0, len(dataset.labels))
np.random.shuffle(all_indices)
n = len(dataset.labels)
n_train = int(n * 0.6)
train_mask = all_indices[:n_train]
test_mask = all_indices[n_train:]
print(len(train_mask), len(test_mask))

1987 1325


## Baselines

In [9]:
raw_features = dataset.inputs.toarray()
raw_features.shape

(3312, 3703)

In [10]:
dataset.inputs

<3312x3703 sparse matrix of type '<class 'numpy.float64'>'
	with 105165 stored elements in Compressed Sparse Row format>

In [11]:
raw_features[1][:20]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [12]:
train_embeds = raw_features[train_mask]
train_labels = dataset.labels[train_mask]
test_embeds = raw_features[test_mask]
test_labels = dataset.labels[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6830188679245283
Random baseline f1 score: 0.18566037735849056


In [13]:
with open(input_dir+'/citeseer_full/citeseer_deepwalk_64.pkl', 'rb') as f:
    y_deepwalk = pickle.load(f)

In [14]:
y_deepwalk.shape

(3312, 64)

In [15]:
train_embeds = y_deepwalk[train_mask]
test_embeds = y_deepwalk[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.48452830188679247
Random baseline f1 score: 0.18566037735849056


In [16]:
y_combined = np.hstack((raw_features, y_deepwalk))
y_combined.shape

(3312, 3767)

In [17]:
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.690566037735849
Random baseline f1 score: 0.18566037735849056


In [18]:
with open(input_dir+'/citeseer_full/citeseer_deepwalk_128.pkl', 'rb') as f:
    y_deepwalk_128 = pickle.load(f)

In [19]:
train_embeds = y_deepwalk_128[train_mask]
test_embeds = y_deepwalk_128[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.5207547169811321
Random baseline f1 score: 0.18566037735849056


In [21]:
y_combined = np.hstack((raw_features, y_deepwalk_128))
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6973584905660377
Random baseline f1 score: 0.18566037735849056


# Visualization

In [22]:
net_parameters = {}
net_parameters['n_components'] = 2
net_parameters['D'] = dataset.input_dim # input dimension
net_parameters['L'] = 2 # number of hidden layers

In [23]:
# Graph net
net_parameters['n_components'] = 2
net_parameters['H'] = 128 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/citeseer_full_1/'
filename = root + 'graph_net_240.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [24]:
y_pred_1 = get_net_projection(dataset.all_data, net_1)

In [25]:
plot_graph_embedding(y_pred_1, dataset.labels, dataset.adj_matrix, line_alpha=0.1)

## Trained networks

In [22]:
net_parameters = {}
net_parameters['n_components'] = 2
net_parameters['D'] = dataset.input_dim # input dimension
net_parameters['L'] = 2 # number of hidden layers

In [42]:
# Graph net
net_parameters['n_components'] = 128
net_parameters['H'] = 128 # number of hidden units

net_1 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_1.cuda()
root = 'results/citeseer_full_4/'
filename = root + 'graph_net_360.pkl'
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [43]:
y_pred_1 = get_net_projection(dataset.all_data, net_1)

In [44]:
train_embeds = y_pred_1[train_mask]
test_embeds = y_pred_1[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.5909433962264151
Random baseline f1 score: 0.18566037735849056


In [45]:
y_combined = np.hstack((raw_features, y_pred_1))
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.7011320754716981
Random baseline f1 score: 0.18566037735849056


In [32]:
# Graph net
net_parameters['n_components'] = 256
net_parameters['H'] = 256 # number of hidden units

net_2 = GraphConvNet(net_parameters)
if torch.cuda.is_available(): net_2.cuda()
root = 'results/pubmed_full_3/'
filename = root + 'graph_net_1.pkl'
checkpoint = torch.load(filename, map_location=device)
net_2.load_state_dict(checkpoint['state_dict'])

In [33]:
y_pred_2 = get_net_projection(dataset.all_data, net_2)

In [34]:
train_embeds = y_pred_2[train_mask]
test_embeds = y_pred_2[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8246481551920882
Random baseline f1 score: 0.3584379358437936


In [35]:
y_combined = np.hstack((raw_features, y_pred_2))
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8496259667807785
Random baseline f1 score: 0.3584379358437936


In [54]:
# Simple net
net_parameters['n_components'] = 128
net_parameters['H'] = 128 # number of hidden units

net_3 = SimpleNet(net_parameters)
if torch.cuda.is_available(): net_3.cuda()
root = 'results/citeseer_full_5/'
filename = root + 'simple_net_400.pkl'
checkpoint = torch.load(filename, map_location=device)
net_3.load_state_dict(checkpoint['state_dict'])

In [55]:
y_pred_3 = get_net_projection(dataset.all_data, net_3)

In [56]:
train_embeds = y_pred_3[train_mask]
test_embeds = y_pred_3[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6158490566037735
Random baseline f1 score: 0.18566037735849056


In [57]:
y_combined = np.hstack((raw_features, y_pred_3))
train_embeds = y_combined[train_mask]
test_embeds = y_combined[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.6709433962264151
Random baseline f1 score: 0.18566037735849056
