In [1]:
import os
# Move the working directory one level up and display it (the recorded output
# shows the repository root) — presumably so that the `core.*` / `util.*`
# imports and the relative 'data/...' paths used below resolve; TODO confirm.
# NOTE(review): '..' is resolved against the *current* directory, so re-running
# this cell without restarting the kernel climbs one more level each time.
os.chdir('..')
os.getcwd()

'/Users/leowyy/Desktop/fyp-graph-clustering'

In [2]:
import pickle 
import numpy as np
import networkx as nx
import time
import scipy.sparse as sp

In [3]:
from core.EmbeddingDataSet import EmbeddingDataSet
from util.graph_utils import get_shortest_path_matrix

In [4]:
# Default cutoff: large enough to act as "no limit" for the graphs used here,
# and also the fill value for pairs that have no connecting path.
MAX_DISTANCE = 1e6

def get_shortest_path_matrix(adj, cutoff=MAX_DISTANCE, verbose=0):
    """Return the dense matrix of unweighted shortest-path lengths of a graph.

    NOTE(review): this notebook-level definition shadows the function of the
    same name imported from ``util.graph_utils`` in an earlier cell.

    Parameters
    ----------
    adj : array of shape (n, n)
        Dense adjacency matrix; non-zero entries are treated as edges and the
        nodes are labelled 0..n-1.
    cutoff : number, optional
        Maximum path length (in edges) explored from every source node.
        Pairs for which no path of length <= ``cutoff`` exists are filled with
        ``cutoff`` itself, so they cannot be told apart from pairs that are
        exactly ``cutoff`` edges apart.
    verbose : int or bool, optional
        When truthy, print the problem size and the elapsed time.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        Entry ``[i, k]`` is the number of edges on a shortest path from ``i``
        to ``k`` or ``cutoff`` when no such path was found.  The dtype follows
        ``cutoff`` (integer cutoff -> integer matrix, float -> float matrix).
    """
    n = adj.shape[0]
    if verbose:
        print("Computing all pairs shortest path lengths for {} nodes...".format(n))
    t_start = time.time()

    # `from_numpy_matrix` was removed in NetworkX 3.0; prefer its replacement
    # and fall back to the old name only on releases that lack it.
    build_graph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
    G = build_graph(adj)

    # Start from "not found" everywhere and fill one row per source node while
    # streaming the generator: this avoids holding the full dict-of-dicts and
    # an n x n Python list of lists in memory next to the result array.
    path_lengths_matrix = np.full((n, n), cutoff)
    for source, lengths in nx.all_pairs_shortest_path_length(G, cutoff=cutoff):
        # keys() and values() of an unmodified dict iterate in the same order.
        path_lengths_matrix[source, list(lengths.keys())] = list(lengths.values())

    t_elapsed = time.time() - t_start
    if verbose:
        print("Time to compute shortest paths (s) = {:.4f}".format(t_elapsed))
    return path_lengths_matrix

In [5]:
# Build the dataset object for `dataset_name` and print its summary.
# NOTE(review): the recorded outputs of later cells report 2708 and 19717 nodes,
# i.e. they were produced with a different `dataset_name` than the one set here;
# re-run from a fresh kernel before trusting those numbers.
dataset_name = 'ms_academic'
# '..' is resolved against the current working directory, which the first cell
# already moved up one level — so this presumably points at a `data` folder
# next to the repository rather than inside it; TODO confirm.
parent_dir = os.path.abspath('..')
input_dir = os.path.join(parent_dir, 'data')
# The meaning of `train=True` is defined by EmbeddingDataSet (not visible here).
dataset = EmbeddingDataSet(dataset_name, input_dir, train=True)
# One unshuffled batch (the recorded output shows a single data block).
dataset.create_all_data(n_batches=1, shuffle=False)
dataset.summarise()

Data blocks of length:  [18333]
Time to create all data (s) = 0.2496
Name of dataset = ms_academic
Input dimension = 6805
Number of training samples = 18333
Training labels = True


In [6]:
# Sanity check: display the adjacency matrix shape (expected to be square, n_nodes x n_nodes).
dataset.adj_matrix.shape

(18333, 18333)

In [7]:
# Convert the adjacency matrix to a dense array, the form passed to
# get_shortest_path_matrix below.
# NOTE(review): the dense copy needs n_nodes**2 entries of memory.
adj = dataset.adj_matrix.toarray()

In [9]:
# Shortest-path lengths truncated at 4 hops, saved as a sparse matrix.
cutoff = 4
path_lengths_matrix = get_shortest_path_matrix(adj, verbose=1, cutoff=cutoff)

# Entries equal to `cutoff` are reset to 0 so that the sparse matrix drops them.
# NOTE(review): this removes pairs with no path within the cutoff, but also
# pairs that are exactly `cutoff` hops apart — confirm that is intended.
path_lengths_matrix[np.equal(path_lengths_matrix, cutoff)] = 0
foo = sp.csr_matrix(path_lengths_matrix)

# Fraction of matrix entries that are stored (non-zero).
print(foo.nnz / np.prod(foo.shape))

# np.save stores the sparse matrix as a pickled object array; it is read back
# with np.load(...).item().
np.save(file='data/{}_cutoff_{}.npy'.format(dataset_name, cutoff), arr=foo)

Computing all pairs shortest path lengths for 18333 nodes...
Time to compute shortest paths (s) = 580.8831
0.04759812818066173


In [10]:
# Shortest-path lengths truncated at 5 hops, saved as a sparse matrix.
cutoff = 5
path_lengths_matrix = get_shortest_path_matrix(adj, verbose=1, cutoff=cutoff)

# Entries equal to `cutoff` are reset to 0 so that the sparse matrix drops them.
# NOTE(review): this removes pairs with no path within the cutoff, but also
# pairs that are exactly `cutoff` hops apart — confirm that is intended.
path_lengths_matrix[np.equal(path_lengths_matrix, cutoff)] = 0
foo = sp.csr_matrix(path_lengths_matrix)

# Fraction of matrix entries that are stored (non-zero).
print(foo.nnz / np.prod(foo.shape))

# np.save stores the sparse matrix as a pickled object array; it is read back
# with np.load(...).item().
np.save(file='data/{}_cutoff_{}.npy'.format(dataset_name, cutoff), arr=foo)

Computing all pairs shortest path lengths for 18333 nodes...
Time to compute shortest paths (s) = 1249.4200
0.22353649017863905


In [19]:
# Effectively unbounded shortest-path lengths (cutoff = MAX_DISTANCE), saved sparsely.
# NOTE(review): the cutoff value is formatted into the file name as-is, so a
# float MAX_DISTANCE ends up in the name in its float form.
cutoff = MAX_DISTANCE
path_lengths_matrix = get_shortest_path_matrix(adj, verbose=1, cutoff=cutoff)

# Entries equal to `cutoff` (pairs for which no path was found) are reset to 0
# so that the sparse matrix drops them.
path_lengths_matrix[np.equal(path_lengths_matrix, cutoff)] = 0
foo = sp.csr_matrix(path_lengths_matrix)

# Fraction of matrix entries that are stored (non-zero).
print(foo.nnz / np.prod(foo.shape))

# np.save stores the sparse matrix as a pickled object array; it is read back
# with np.load(...).item().
np.save(file='data/{}_cutoff_{}.npy'.format(dataset_name, cutoff), arr=foo)

Computing all pairs shortest path lengths for 2708 nodes...
Time to compute shortest paths (s) = 23.2980
0.8418946870043135


In [14]:
# Shortest-path lengths truncated at 8 hops, saved as a sparse matrix.
cutoff = 8
path_lengths_matrix = get_shortest_path_matrix(adj, verbose=1, cutoff=cutoff)

# Entries equal to `cutoff` are reset to 0 so that the sparse matrix drops them.
# NOTE(review): this removes pairs with no path within the cutoff, but also
# pairs that are exactly `cutoff` hops apart — confirm that is intended.
path_lengths_matrix[np.equal(path_lengths_matrix, cutoff)] = 0
foo = sp.csr_matrix(path_lengths_matrix)

# Fraction of matrix entries that are stored (non-zero).
print(foo.nnz / np.prod(foo.shape))

# np.save stores the sparse matrix as a pickled object array; it is read back
# with np.load(...).item().
np.save(file='data/{}_cutoff_{}.npy'.format(dataset_name, cutoff), arr=foo)

Computing all pairs shortest path lengths for 19717 nodes...
Time to compute shortest paths (s) = 1696.0858
0.7965918589961019


In [54]:
# File to be re-saved by the following cells.
# NOTE(review): absolute, machine-specific path; it refers to a pubmed file,
# not to the `dataset_name` configured earlier in this notebook.
path = "/Users/leowyy/Desktop/data/pubmed_test/pubmed_full_cutoff_5.npy"

In [55]:
# The .npy file holds a 0-d object array wrapping the saved matrix (hence the
# `.item()`), so it can only be read with pickle support enabled: NumPy >= 1.16.3
# defaults to allow_pickle=False and raises ValueError for object arrays.
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
foo = np.load(path, allow_pickle=True).item()
foo.shape

(19717, 19717)

In [56]:
# Re-wrap the loaded matrix as a SciPy CSR matrix and display its shape
# (presumably the file held the matrix in a different format — TODO confirm).
bar = sp.csr_matrix(foo)
bar.shape

(19717, 19717)

In [57]:
# Write the CSR version back to the same path, overwriting the file that was
# just loaded (no backup is kept).
np.save(path, bar)