In [1]:
import os

# Move up to the project root so that the `core` and `util` packages used
# below can be imported and relative paths such as `results/...` resolve.
# Guarded on the presence of `core` so that re-running this cell does not
# climb one directory further each time.
if not os.path.isdir('core'):
    os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import pickle 
import numpy as np
import pandas as pd
import torch
import os
from timeit import default_timer as timer
import networkx as nx
import scipy.sparse as sp

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
from core.SimpleNet import SimpleNet
from core.GraphConvNet import GraphConvNet
from core.EmbeddingDataSet import EmbeddingDataSet
from core.GraphDataBlock import GraphDataBlock
from util.plot_graph_embedding import plot_graph_embedding
from util.evaluation_metrics import graph_trustworthiness, trustworthiness, run_regression
from util.network_utils import get_net_projection, get_net_embeddings
from util.graph_utils import get_shortest_path_matrix, neighbor_sampling
from util.io_utils import unpack_deepwalk_embedding

cuda not available
cuda not available


In [4]:
# Configure Bokeh to render its plots inline in the notebook output cells.
from bokeh.io import output_notebook
output_notebook()

In [5]:
# Choose the torch device once; later cells pass `device` to torch.load.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('cuda available' if device == 'cuda' else 'cuda not available')

cuda not available


In [6]:
# Locate the data directory. `..` is resolved against the current working
# directory (changed in the first cell), so this is a `data` folder one level
# above the cwd.
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')

# Build the full Reddit training set as a single, unshuffled data block and
# print its summary.
dataset_name = 'reddit_full'
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
dataset.create_all_data(n_batches=1, shuffle=False)
dataset.summarise()

Data blocks of length:  [23699]
Time to create all data (s) = 0.2065
Name of dataset = reddit_full
Input dimension = 602
Number of training samples = 23699
Training labels = True


In [7]:
# Reproducible random train/test split over node indices.
SPLIT_SEED = 0        # fixed so the split, and every score computed from it, is repeatable
TRAIN_FRACTION = 0.6  # share of nodes used for training

n = len(dataset.labels)
n_train = int(n * TRAIN_FRACTION)

# A private RandomState keeps the split independent of whatever else has
# consumed numpy's global random stream before this cell runs.
split_rng = np.random.RandomState(SPLIT_SEED)
all_indices = split_rng.permutation(n)

# NOTE: despite the names, these are integer index arrays, not boolean masks.
train_mask = all_indices[:n_train]
test_mask = all_indices[n_train:]
print(len(train_mask), len(test_mask))

14219 9480


## Baselines

In [8]:
# Densify the sparse input matrix so it can be row-indexed and scaled in the
# baseline cells below; the trailing expression displays its shape.
raw_features = dataset.inputs.toarray()
raw_features.shape

(23699, 602)

In [9]:
# Display the repr of the original (sparse) input matrix for reference.
dataset.inputs

<23699x602 sparse matrix of type '<class 'numpy.float64'>'
	with 14213802 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.preprocessing import StandardScaler

# Raw-feature baseline: slice features and labels with the train/test indices.
train_feats, test_feats = raw_features[train_mask], raw_features[test_mask]
train_labels, test_labels = dataset.labels[train_mask], dataset.labels[test_mask]

# Standardise with statistics fitted on the training rows only, then apply
# the same transform to both splits.
scaler = StandardScaler().fit(train_feats)
z_train_feats = scaler.transform(train_feats)
z_test_feats = scaler.transform(test_feats)

run_regression(z_train_feats, train_labels, z_test_feats, test_labels)

F1 score: 0.47763713080168774
Random baseline f1 score: 0.04947257383966245


In [11]:
# Directory holding this dataset's files: <input_dir>/<dataset_name>.
data_dir = os.path.join(input_dir, dataset_name)

In [12]:
# Read the DeepWalk embedding file into a matrix (the helper prints its shape).
# NOTE(review): later cells index this with train_mask/test_mask, which assumes
# the rows are ordered by node index -- confirm against unpack_deepwalk_embedding.
y_deepwalk_256 = unpack_deepwalk_embedding(os.path.join(data_dir, 'reddit_256.embeddings'))

Embedding matrix shape:  (23699, 256)


In [13]:
# DeepWalk-only baseline: score the embeddings on the same train/test split.
train_embeds, test_embeds = y_deepwalk_256[train_mask], y_deepwalk_256[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8780590717299578
Random baseline f1 score: 0.04947257383966245


In [14]:
# DeepWalk embeddings concatenated with the node features.
# Both halves are rebuilt from their sources rather than stacked onto the
# current train_embeds/test_embeds, so re-running this cell cannot keep
# widening the matrices.
# NOTE(review): train_feats/test_feats are the unscaled features, whereas the
# raw-feature baseline used the standardised z_* versions -- confirm intended.
train_embeds = np.hstack((train_feats, y_deepwalk_256[train_mask]))
test_embeds = np.hstack((test_feats, y_deepwalk_256[test_mask]))
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.8879746835443038
Random baseline f1 score: 0.04947257383966245


## Visualize DeepWalk embeddings

In [15]:
# Sample a small sub-graph to visualise: draw n_plot distinct nodes, then let
# neighbor_sampling expand them into the final index set `mask`.
n_plot = 50
D_layers = [-1, 0]  # forwarded as-is to neighbor_sampling

seed_nodes = np.random.choice(dataset.all_indices, size=n_plot, replace=False)
mask = neighbor_sampling(dataset.adj_matrix, seed_nodes, D_layers)
print(len(mask))

# Restrict inputs, labels and adjacency (rows, then columns) to the sampled
# nodes and wrap them in a data block.
inputs, labels = dataset.inputs[mask], dataset.labels[mask]
W = dataset.adj_matrix[mask, :][:, mask]
G = GraphDataBlock(inputs, labels, W)

1506


In [17]:
# DeepWalk vectors of the nodes selected by `mask` (the sampled sub-graph).
dw_embed = y_deepwalk_256[mask]

In [16]:
from sklearn.manifold import TSNE

In [18]:
# Reduce the sampled DeepWalk vectors to 2-D with exact t-SNE for plotting.
# t-SNE is stochastic; random_state is fixed so the layout is the same on
# every run of the notebook.
embedder = TSNE(n_components=2, method="exact", perplexity=30, verbose=1, random_state=0)
y_pred_deepwalk = embedder.fit_transform(dw_embed)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 1506
[t-SNE] Computed conditional probabilities for sample 1506 / 1506
[t-SNE] Mean sigma: 1.454297
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.757841
[t-SNE] KL divergence after 1000 iterations: 0.996420


In [19]:
# Plot the 2-D t-SNE coordinates together with the sub-graph's labels and
# adjacency W (line_alpha presumably sets the edge-line transparency -- see
# util.plot_graph_embedding).
plot_graph_embedding(y_pred_deepwalk, labels, W, line_alpha=0.1)

## Trained networks

In [15]:
# Rebuild the dataset's data blocks, this time as two unshuffled batches
# (the earlier load used a single batch).
dataset.create_all_data(n_batches=2, shuffle=False)

Data blocks of length:  [11850, 11849]
Time to create all data (s) = 0.1988


In [16]:
# Base hyper-parameters shared by the networks loaded below; the per-network
# cells overwrite 'n_components' and add 'H'.
net_parameters = {
    'n_components': 2,
    'D': dataset.input_dim,  # input dimension
    'L': 2,                  # number of hidden layers
}

In [17]:
# Graph net 1: 256 output components, 256 hidden units.
# The shared net_parameters dict is updated in place before construction.
net_parameters['n_components'] = 256
net_parameters['H'] = 256 # number of hidden units

net_1 = GraphConvNet(net_parameters)
# Use the `device` chosen earlier (the same value given to map_location
# below) instead of querying CUDA availability a second time.
net_1.to(device)

root = 'results/reddit_full_4/'
filename = os.path.join(root, 'graph_net_1.pkl')
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a trusted source.
checkpoint = torch.load(filename, map_location=device)
net_1.load_state_dict(checkpoint['state_dict'])

In [18]:
# Project every data block through net_1.
# NOTE(review): the result is indexed with train_mask/test_mask in the next
# cell, which assumes its rows follow the original node order even though the
# data was re-batched into two blocks -- confirm in get_net_projection.
y_pred_1 = get_net_projection(dataset.all_data, net_1)

In [19]:
# Score net_1's embeddings on the same train/test split as the baselines.
train_embeds, test_embeds = y_pred_1[train_mask], y_pred_1[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.31518987341772153
Random baseline f1 score: 0.051582278481012656


In [20]:
# net_1 embeddings concatenated with the node features.
# Both halves are rebuilt from their sources rather than stacked onto the
# current train_embeds/test_embeds, so re-running this cell cannot keep
# widening the matrices.
train_embeds = np.hstack((train_feats, y_pred_1[train_mask]))
test_embeds = np.hstack((test_feats, y_pred_1[test_mask]))
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.32162447257383964
Random baseline f1 score: 0.051582278481012656


In [38]:
# Graph net 2: 256 output components, 512 hidden units.
# The shared net_parameters dict is updated in place before construction.
net_parameters['n_components'] = 256
net_parameters['H'] = 512 # number of hidden units

net_2 = GraphConvNet(net_parameters)
# Use the `device` chosen earlier (the same value given to map_location
# below) instead of querying CUDA availability a second time.
net_2.to(device)

root = 'results/reddit_full_5/'
# The checkpoint in this run directory is also named graph_net_1.pkl.
filename = os.path.join(root, 'graph_net_1.pkl')
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a trusted source.
checkpoint = torch.load(filename, map_location=device)
net_2.load_state_dict(checkpoint['state_dict'])

In [39]:
# Project every data block through net_2.
# NOTE(review): indexed with train_mask/test_mask in the next cell, so rows
# are assumed to follow the original node order -- confirm in get_net_projection.
y_pred_2 = get_net_projection(dataset.all_data, net_2)

In [40]:
# Score net_2's embeddings on the same train/test split as the baselines.
train_embeds, test_embeds = y_pred_2[train_mask], y_pred_2[test_mask]
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.29377637130801687
Random baseline f1 score: 0.051582278481012656


In [41]:
# net_2 embeddings concatenated with the node features.
# Both halves are rebuilt from their sources rather than stacked onto the
# current train_embeds/test_embeds, so re-running this cell cannot keep
# widening the matrices.
train_embeds = np.hstack((train_feats, y_pred_2[train_mask]))
test_embeds = np.hstack((test_feats, y_pred_2[test_mask]))
run_regression(train_embeds, train_labels, test_embeds, test_labels)

F1 score: 0.32573839662447257
Random baseline f1 score: 0.051582278481012656
