In [1]:
import sys
sys.path.insert(0, '../src/')

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import scipy.sparse as sp
from scipy.sparse import load_npz

import torch

import utils
from net import start_experiments

In [270]:
# Read the sparse matrix stored in the CORA-ML .npz file; the cells below
# pass it to the split routines as the graph's adjacency matrix.
_A_obs = sp.load_npz('../data/datasets/CORA-ML.npz')

In [279]:
# Split configuration, reused by the later call to train_val_test_split.
val_share = 0.1    # passed as the validation share of the split
test_share = 0.05  # passed as the test share of the split
seed = 481516234   # seed handed to the split routines

# Split via the project helper.
# NOTE(review): presumably train_ones holds (row, col) index pairs into _A_obs,
# with both directions of every undirected edge -- the symmetry assertion below
# is what actually checks this; confirm against utils.
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(
    _A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False
)

# Pass the shape explicitly: without it csr_matrix infers the size from the
# largest index present in train_ones, so a graph whose highest-numbered nodes
# have no training edge would silently yield a smaller (possibly non-square)
# matrix than _A_obs.
train_graph = sp.csr_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1])),
    shape=_A_obs.shape,
)
# Symmetry check done on the sparse matrix, avoiding two dense N x N copies.
assert (train_graph != train_graph.T).nnz == 0

In [273]:
import networkx as nx
def train_val_test_split(A, val_share, test_share, seed=123):
    """Split the edges of a symmetric, connected graph into train/val/test sets.

    The edges of a minimum spanning tree are always kept in the training graph,
    so the training graph spans every node and stays connected. Validation and
    test edges are drawn at random from the remaining edges. For each of the
    two held-out splits, node pairs that are NOT edges of the input graph are
    sampled as negative examples.

    Note: this reseeds NumPy's *global* random state with ``seed``.

    Parameters
    ----------
    A : scipy sparse matrix
        Adjacency matrix; must be symmetric and describe a connected graph.
    val_share : float
        Fraction of the undirected edges to hold out for validation.
    test_share : float
        Fraction of the undirected edges to hold out for testing.
    seed : int, optional
        Seed passed to ``np.random.seed``.

    Returns
    -------
    train_graph : scipy.sparse.csr_matrix
        Adjacency matrix of the training graph over the same node set.
        Edge weights of ``A`` are not carried over.
    val_ones, val_zeros, test_ones, test_zeros : np.ndarray
        Arrays whose rows are node pairs ``(u, v)`` with ``u <= v``: held-out
        edges (``*_ones``) and sampled non-edges (``*_zeros``).

    Raises
    ------
    AssertionError
        If ``A`` is not symmetric, the graph is not connected, the requested
        shares are too large, or too few non-edges were sampled.
    """
    np.random.seed(seed)

    # NetworkX 3.0 removed the *_scipy_sparse_matrix converters in favour of the
    # *_scipy_sparse_array ones; prefer the new names and fall back to the old
    # ones so the function runs on both old and new NetworkX versions.
    from_sparse = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
    to_sparse = getattr(nx, 'to_scipy_sparse_array', None) or nx.to_scipy_sparse_matrix

    G = from_sparse(A)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    # Test symmetric, connected and has sufficiently many edges
    assert (abs(A-A.T)>1e-10).nnz == 0, 'Graph is not symmetric.'
    assert nx.is_connected(G), 'Graph is not connected.'
    assert num_edges - num_nodes > (val_share + test_share) * num_edges + 1, 'Val- and test-share are too large.'

    def canonical(edge):
        """Return the undirected edge as a (smaller, larger) node tuple."""
        u, v = edge
        return (u, v) if u <= v else (v, u)

    # Ensure that the train graph stays connected by protecting the edges of a
    # spanning tree. Every edge is stored in one canonical orientation so that
    # the set difference here and the non-edge filter further down do not
    # depend on the orientation in which NetworkX happens to report an edge.
    protected_edges = [canonical(edge) for edge in nx.minimum_spanning_tree(G).edges()]
    edges_set = {canonical(edge) for edge in G.edges()}

    # Split edges into val-, test- and training-set
    free_edges = list(edges_set - set(protected_edges))
    np.random.shuffle(free_edges)
    num_val = int(val_share * num_edges)
    num_test = int(test_share * num_edges)
    val_ones = np.array(free_edges[:num_val])
    test_ones = np.array(free_edges[num_val:num_val+num_test])
    train_edges = free_edges[num_val+num_test:] + protected_edges

    # Rebuild the training graph on the full node set so that the matrix keeps
    # the node count of the input graph.
    G_train = nx.Graph()
    G_train.add_nodes_from(G)
    G_train.add_edges_from(train_edges)
    # Wrap in csr_matrix so the return type is the same whichever converter
    # was selected above.
    train_graph = sp.csr_matrix(to_sparse(G_train))

    # Draw non-edges from input graph: draw random tuples, remove direction, loops, and input edges
    non_edges = np.random.choice(num_nodes, size=(2*(num_val+num_test), 2))
    # Drop loops, then sort each row so every pair is (smaller, larger)
    non_edges = np.sort(non_edges[non_edges[:, 0] != non_edges[:, 1]])
    non_edges = np.unique(non_edges, axis=0) # Remove duplicate pairs
    non_edges = [tuple(edge) for edge in non_edges if tuple(edge) not in edges_set] # Remove input edges
    np.random.shuffle(non_edges)
    assert len(non_edges) >= num_val + num_test, 'Too few non-edges sampled.'
    val_zeros = np.array(non_edges[:num_val])
    test_zeros = np.array(non_edges[num_val:num_val+num_test])
    return train_graph, val_ones, val_zeros, test_ones, test_zeros

In [277]:
# Split again with the spanning-tree based routine defined in this notebook.
# This rebinds train_graph, val_ones, val_zeros, test_ones and test_zeros,
# replacing the values produced by the utils-based split cell.
train_graph, val_ones, val_zeros, test_ones, test_zeros = train_val_test_split(
    _A_obs,
    val_share,
    test_share,
    seed,
)