In [None]:
from gnnbench.data.io import load_dataset
from tqdm.notebook import tqdm
from grapht.metrics import all_pairs_path_length
import numpy as np
import networkx as nx
import pandas as pd
import pickle

# Export Data
Processes the data using gnnbench and saves it. Eventually this functionality will be added to the data submodule.

In [None]:
def gnnbench_data(name):
    """Load a gnnbench dataset by name and return its unpacked components.

    The dataset is read from ``data/<name>`` via ``load_dataset``, its
    ``standardize()`` method is called, and the result of ``unpack()`` is
    split into three values.

    Args:
        name: Dataset name; used as the file stem under ``data/``.

    Returns:
        A 3-tuple ``(A, X, y)`` exactly as produced by ``unpack()``. Callers
        in this notebook treat ``A`` as a scipy sparse adjacency matrix,
        ``X`` as a 2-D node-feature matrix and ``y`` as the label vector.
    """
    graph = load_dataset(f'data/{name}')
    graph.standardize()
    adjacency, features, labels = graph.unpack()
    return adjacency, features, labels

In [None]:
# Export every dataset to data/<name>_gnnbench.npz (np.savez appends the
# ".npz" extension when it is missing). The datasets are kept in their three
# original groups, but a single nested loop replaces the three copy-pasted
# loops so the export logic lives in one place.
dataset_groups = [
    ['cora', 'citeseer', 'pubmed'],
    ['amazon_electronics_photo', 'amazon_electronics_computers'],
    ['ms_academic_cs', 'ms_academic_phy'],
]
for datasets in dataset_groups:
    for dataset in datasets:
        A, X, y = gnnbench_data(dataset)
        # NOTE(review): if A (or X) is a scipy sparse matrix, np.savez stores
        # it as a pickled object array, so readers need
        # np.load(..., allow_pickle=True) — confirm downstream loaders do this.
        np.savez(f'data/{dataset}_gnnbench', A=A, X=X, y=y)

## Data information

In [None]:
# Build a summary table with one row per dataset: node count, edge count,
# feature ("signal") dimension and number of distinct labels.
all_datasets = [
    'cora',
    'citeseer',
    'pubmed',
    'amazon_electronics_photo',
    'amazon_electronics_computers',
    'ms_academic_cs',
    'ms_academic_phy',
]
summary_columns = ['dataset', 'nodes', 'edges', 'signal dimension', 'classes']

data = []
for dataset in all_datasets:
    A, X, y = gnnbench_data(dataset)
    # NOTE(review): nx.from_scipy_sparse_matrix was removed in NetworkX 3.0;
    # confirm the pinned NetworkX version before upgrading the environment.
    G = nx.from_scipy_sparse_matrix(A)
    # Second axis of X is what the table reports as "signal dimension".
    n, d = X.shape
    no_classes = len(np.unique(y))
    row = [dataset, G.number_of_nodes(), G.number_of_edges(), d, no_classes]
    data.append(row)

df = pd.DataFrame(data, columns=summary_columns)
print(df.sort_values(by='nodes'))

# Linegraph path lengths

This caches all pairwise shortest-path distances in the line graph so that later path-length computations are faster. Cora takes about 30s, citeseer about 60s, and pubmed at least 1 hour 36 minutes.

In [None]:
import scipy.sparse as sp
from timebudget import timebudget


In [None]:
# Compute and cache all-pairs shortest-path distances between the nodes of the
# line graph of one dataset (line-graph nodes are edges of the original graph).
dataset = 'cora'
A, X, y = gnnbench_data(dataset)

# From here on G is the line graph and A is the line graph's adjacency matrix;
# both names are deliberately rebound, as later cells inspect G.
G = nx.line_graph(nx.from_scipy_sparse_matrix(A))
A = nx.to_scipy_sparse_matrix(G)

# Only the Dijkstra run and the shape print are inside the timed region;
# writing the result to disk is not.
with timebudget(dataset):
    D = sp.csgraph.dijkstra(A)
    print(D.shape)

np.save(f'data/{dataset}_linegraph_distances', D)

(5069, 5069)
cora took 14.772sec


In [None]:
# Inspect the nodes of G, which at this point is the line graph produced by
# nx.line_graph above (each of its nodes is an edge of the original graph).
# NOTE(review): presumably the rows/columns of the saved distance matrix follow
# this node order — confirm before indexing into the cached distances.
G.nodes()