In [1]:
import pickle
import os
import glob
import networkx as nx
import numpy as np
import torch.utils.data
import torch
from torch_geometric.data import Data
from tqdm import tqdm
from dataset import graph_algorithms

In [2]:
def to_categorical(x, N):
    """Return a one-hot style encoding of ``x``.

    Args:
        x: Index (or any valid NumPy index expression) to switch on.
        N: Shape passed to ``np.zeros``; for an integer this is the vector length.

    Returns:
        A float NumPy array of zeros in which the entries selected by ``x``
        are set to 1.
    """
    one_hot = np.zeros(shape=N)
    one_hot[x] = 1
    return one_hot

In [8]:
def _build_multitask_sample(file_name, seed, with_name=False):
    """Build one multitask ``Data`` sample from a pickled graph file.

    Shared by ``generate_multitask_graph_data`` and
    ``generate_multitask_graph_testset`` so both splits are produced by the
    exact same pipeline.

    Args:
        file_name: Path of a pickle file containing a dict with a ``'J'``
            entry exposing ``.todense()`` and a ``'msg_node'`` entry; the
            ``'name'`` entry is read as well when ``with_name`` is true.
        seed: Seed of the per-sample ``np.random.RandomState`` used to draw
            the node values and the source node.
        with_name: When true, the graph's ``'name'`` is attached to the
            returned object as ``name``.

    Returns:
        A ``torch_geometric.data.Data`` with ``x`` (node features),
        ``edge_index``, ``y = (node_labels, graph_labels)``, ``adj`` (the
        unweighted adjacency matrix) and ``graph`` (the networkx graph).
    """
    npr = np.random.RandomState(seed=seed)

    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager guarantees the file handle is closed.
    with open(file_name, "rb") as handle:
        graph_data = pickle.load(handle)

    dense_J = graph_data['J'].todense()
    G = nx.from_numpy_array(dense_J)
    # weight=None discards the edge weights: A is the plain adjacency matrix.
    A = nx.to_numpy_array(G, weight=None)
    num_nodes_I = A.shape[0]

    # Draw order (uniform first, then randint) must be kept so that a given
    # seed keeps producing the same sample.
    node_values = npr.uniform(low=0, high=1, size=num_nodes_I)  # i.i.d ~ u(0, 1)
    source_node = npr.randint(0, num_nodes_I)  # single source to calculate the shortest path

    # Node-level targets, stacked in this column order:
    # (shortest path from source, graph_laplacian_features, eccentricity).
    # Presumably each is a length-|V| vector, giving size (|V|, 3) - TODO confirm
    # against graph_algorithms.
    sssp = graph_algorithms.all_pairs_shortest_paths(A, 0)[source_node]
    graph_laplacian = graph_algorithms.graph_laplacian_features(A, node_values)
    eccentricity = graph_algorithms.eccentricity(A)
    node_label = np.swapaxes(np.stack([sssp, graph_laplacian, eccentricity]), 0, 1)

    # Graph-level targets, in this order: (is_connected, diameter, spectral_radius).
    is_connected = graph_algorithms.is_connected(A)
    diameter = graph_algorithms.diameter(A)
    spectral_radius = graph_algorithms.spectral_radius(A)
    graph_labels = np.asarray([is_connected, diameter, spectral_radius]).flatten()

    # Input features => size = (|V|, 2): one-hot source-node indicator next to
    # the i.i.d. node values.
    features = np.stack([to_categorical(source_node, num_nodes_I), node_values], axis=1)

    # variables from numpy to Pytorch tensor
    features = torch.from_numpy(np.asarray(features)).float()
    node_labels = torch.from_numpy(np.asarray(node_label)).float()
    graph_labels = torch.from_numpy(np.asarray(graph_labels)).float()
    # 'msg_node' is transposed before use; presumably stored as (num_edges, 2)
    # pairs - TODO confirm.
    edge_index = torch.tensor(graph_data['msg_node']).t().contiguous().long()

    extra = {'name': graph_data['name']} if with_name else {}
    return Data(x=features, edge_index=edge_index, y=(node_labels, graph_labels),
                adj=A, graph=G, **extra)


def generate_multitask_graph_data(data_path, split, meta_data_path):
    """Convert every ``*.p`` graph of one split into a saved multitask sample.

    Files are read from ``<data_path>/<split>/*.p`` in sorted order and the
    sample built from the ``idx``-th file (seeded with ``idx``) is written to
    ``data_temp/multitask_meta/<meta_data_path>/<split>/<meta_data_path>_<idx>.pt``.

    Args:
        data_path: Directory containing the split sub-directories.
        split: Name of the split sub-directory (e.g. ``'train'`` or ``'val'``).
        meta_data_path: Name of the meta group; used both as output
            sub-directory and as file-name prefix.
    """
    data_files = sorted(glob.glob(os.path.join(data_path, split, '*.p')))
    out_dir = os.path.join('data_temp/multitask_meta/', meta_data_path, split)
    # exist_ok avoids the check-then-create race when several worker
    # processes generate data concurrently.
    os.makedirs(out_dir, exist_ok=True)
    for idx, file_name in tqdm(enumerate(data_files), total=len(data_files)):
        data = _build_multitask_sample(file_name, seed=idx)
        torch.save(data, os.path.join(out_dir, f'{meta_data_path}_{idx}.pt'))


def generate_multitask_graph_testset(root):
    """Convert every ``*.p`` graph under ``root`` into a saved test sample.

    Files are read in sorted order; the ``idx``-th sample is seeded with
    ``idx + 43212``, carries the graph's ``name`` and is written to
    ``data_temp/multitask/test/data_<idx>.pt``.

    Args:
        root: Directory containing the pickled test graphs.
    """
    data_files = sorted(glob.glob(os.path.join(root, '*.p')))
    out_dir = 'data_temp/multitask/test'
    # Create the output directory up front so torch.save does not fail on a
    # fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    for idx, file_name in tqdm(enumerate(data_files), total=len(data_files)):
        # The fixed offset presumably keeps test seeds apart from the
        # train/val seeds, which start at 0.
        data = _build_multitask_sample(file_name, seed=idx + 43212, with_name=True)
        torch.save(data, os.path.join(out_dir, f'data_{idx}.pt'))

In [None]:
import multiprocessing

# Meta-group directories to convert; each is expected to hold 'train' and
# 'val' sub-directories of pickled graphs.
path = ['data/exp2_train_100_0.3_meta/meta_group_1',
        'data/exp2_train_100_0.3_meta/meta_group_2',
        'data/exp2_train_100_0.3_meta/meta_group_3',
        'data/exp2_train_100_0.3_meta/meta_group_4',
        'data/exp2_train_100_0.3_meta/meta_group_5']


def generation(data_path):
    """Generate the train and val samples for one meta-group directory.

    Args:
        data_path: Path of a meta-group directory; its last path component is
            used as the meta-group name for the output files.
    """
    # normpath drops a trailing separator, so 'a/b/' still yields 'b' instead
    # of the empty string that split('/')[-1] would give.
    meta_data_path = os.path.basename(os.path.normpath(data_path))
    generate_multitask_graph_data(data_path, 'train', meta_data_path)
    generate_multitask_graph_data(data_path, 'val', meta_data_path)


# One task per meta group. The context manager shuts the worker processes down
# once map() has returned instead of leaving them alive in the kernel.
# NOTE: with the 'spawn' start method, functions defined interactively may not
# be importable by the worker processes (see the multiprocessing docs).
with multiprocessing.Pool() as pool_obj:
    pool_obj.map(generation, path)

15it [01:02,  4.26s/it]