In [None]:
#hide
# Reload imported modules automatically before each cell runs, so edits to
# the library source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp data

# Data

> Utilities for retrieving benchmarks and generating random graphs

In [None]:
#export 
from nbdev.showdoc import *
import numpy as np
import networkx as nx
import scipy
import pygsp
from pathlib import Path

## Benchmarks

These datasets were downloaded and preprocessed using https://github.com/shchur/gnn-benchmark. I aim to replace the bundled copies once I work out how to make gnn-benchmark a dependency.

In [None]:
#export
def get_benchmark(dataset):
    """Returns an adjacency matrix `A`, feature matrix `X` and labels `y` from standard benchmark.
    
    The data is normalised as done in `https://github.com/shchur/gnn-benchmark`.
    
    The archive is looked up at `data/standardised_{dataset}.npz`, relative to the
    directory one level above the one containing this module.
    
    Args:
        `dataset` (string): Can be cora, pubmed, citeseer, amazon_electronics_photo, 
                            amazon_electronics_computers, ms_academic_phy or ms_academic_cs.
                            Any other name with a matching `standardised_{dataset}.npz`
                            file (e.g. cora_full) is loaded the same way.
                            
    Returns:
        `A`: The adjacency matrix object stored in the archive (a sparse matrix for the bundled datasets)  
        `X`: The feature matrix object stored in the archive (a sparse matrix for the bundled datasets)  
        `y`: Labels as a numpy array
        
    Raises:
        FileNotFoundError: If no archive exists for `dataset`.
    """
    fname = Path(__file__).parents[1].joinpath(f'data/standardised_{dataset}.npz')
    # NOTE(security): `allow_pickle=True` is needed because `A` and `X` are stored as
    # pickled Python objects; unpickling can execute arbitrary code, so only load
    # archives from a trusted source.
    # The context manager closes the underlying zip file once the arrays are read;
    # entries are fully materialised on access, so they stay valid after closing.
    with np.load(fname, allow_pickle=True) as data:
        # `A` and `X` are 0-d object arrays; `.tolist()` unwraps the stored object.
        A, X, y = data['A'].tolist(), data['X'].tolist(), data['y']
    return A, X, y

## Synthetic

In [None]:
#export

def get_planar_graph(n, return_pos=False):
    """Returns a networkx graph which is planar on n nodes.
    
    `n` points are drawn uniformly at random in the unit square and joined by the
    edges of their Delaunay triangulation. The generation process is taken from:
    https://stackoverflow.com/questions/26681899/how-to-make-networkx-graph-from-delaunay-preserving-attributes-of-the-input-node
    
    Nodes are labelled `0, ..., n-1` and are always all present in the graph.
    For `n < 3` no triangulation exists, so the trivial planar graph is returned
    (no edges for `n` of 0 or 1, a single edge for `n == 2`).
    
    Args:
        `n` (int): Number of nodes.
        `return_pos` (bool): Whether to also return the planar embedding.
    
    Returns:
        The graph, or `(graph, pos)` if `return_pos`, where `pos` is a planar
        embedding in the form of {node : (x, y)}.
    """
    # Imported locally: a bare `import scipy` does not guarantee that the
    # `scipy.spatial` submodule has been loaded.
    from itertools import combinations
    from scipy.spatial import Delaunay

    points = np.random.rand(n, 2)
    edges = set()
    if n >= 3:
        # `simplices` replaces the deprecated `vertices` alias, which newer
        # SciPy releases no longer provide.
        triangulation = Delaunay(points)
        for simplex in triangulation.simplices:
            # Sorting the corners first stores every edge as (small, large), so an
            # edge shared by two triangles is only kept once.
            corners = sorted(int(v) for v in simplex)
            edges.update(combinations(corners, 2))
    elif n == 2:
        edges.add((0, 1))

    graph = nx.Graph()
    # Add nodes explicitly so the node set always matches the keys of `pos`,
    # even if a point were left out of the triangulation.
    graph.add_nodes_from(range(n))
    graph.add_edges_from(edges)

    if return_pos:
        pos = dict(zip(range(len(points)), points))
        return graph, pos
    return graph
    
def get_sensor_graph(n):
    """Returns a networkx graph for a connected KNN sensor graph on `n` nodes.
    
    Sensor graphs are sampled repeatedly until a connected one is found; its
    weight matrix `W` is then converted into a networkx graph.
    
    This relies on the GitHub version of pygsp.graphs.Sensor, not the stable release (i.e. as described in the docs).
    """
    while True:
        sensor = pygsp.graphs.Sensor(n)
        if sensor.is_connected():
            # Keep the first connected sample.
            break
    return nx.Graph(sensor.W)

In [None]:
#hide
# Convert the project's notebooks into library modules; the cell output lists
# each notebook that was converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_graphtools.ipynb.
Converted 01_sampling.ipynb.
Converted 02_metrics.ipynb.
Converted 03_perturb.ipynb.
Converted 04_plotting.ipynb.
Converted 05_data.ipynb.
Converted index.ipynb.


In [None]:
# Smoke-check the exported module: load the 'cora_full' benchmark and keep
# only the labels (the adjacency and feature matrices are discarded).
from grapht.data import get_benchmark
_, _, y = get_benchmark('cora_full')

In [None]:
# Show the distinct label values present in `y`.
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69])