In [1]:
from torch_geometric.datasets import Planetoid, CitationFull, NELL
from Proposed.proposed_dataset import ProposedDataset
import torch_geometric.transforms as T
from torch_geometric.data import InMemoryDataset, Data
import torch


# Root directory handed to the Planetoid, NELL and ProposedDataset loaders below.
DATASET_ROOT_FOLDER = "Datasets"
# Separate root handed to InMemoryNellDataset (see get_in_memeory_nell_dataset).
DATASET_ROOT_FOLDER_FOR_NELL = "Datasets/NELL"

def get_citeseer_dataset():
    """Return the CiteSeer Planetoid dataset with re-drawn split masks.

    The dataset is created with root ``DATASET_ROOT_FOLDER`` and
    ``split='random'``; its train/val/test masks are then overwritten in
    place by ``random_split`` using that function's default sizes.
    """
    citeseer = Planetoid(root=DATASET_ROOT_FOLDER, name="CiteSeer", split="random")
    return random_split(citeseer)

def get_cora_dataset():
    """Return the Cora Planetoid dataset with re-drawn split masks.

    The dataset is created with root ``DATASET_ROOT_FOLDER`` and
    ``split='random'``; its train/val/test masks are then overwritten in
    place by ``random_split`` using that function's default sizes.
    """
    cora = Planetoid(root=DATASET_ROOT_FOLDER, name="Cora", split="random")
    return random_split(cora)


def get_pubmed_dataset():
    """Return the PubMed Planetoid dataset with re-drawn split masks.

    The dataset is created with root ``DATASET_ROOT_FOLDER`` and
    ``split='random'``; its train/val/test masks are then overwritten in
    place by ``random_split`` using that function's default sizes.
    """
    pubmed = Planetoid(root=DATASET_ROOT_FOLDER, name="PubMed", split="random")
    return random_split(pubmed)


def get_nell_dataset():
    """Return the NELL dataset with a ``RandomNodeSplit`` transform attached.

    The transform is built with ``split='random'`` and passed to ``NELL`` as
    its ``transform``; the dataset uses ``DATASET_ROOT_FOLDER`` as its root.
    """
    return NELL(
        root=DATASET_ROOT_FOLDER,
        transform=T.RandomNodeSplit(split="random"),
    )


def get_in_memeory_nell_dataset():
    """Return ``InMemoryNellDataset`` with freshly drawn NELL-sized masks.

    The dataset is rooted at ``DATASET_ROOT_FOLDER_FOR_NELL`` and then passed
    through ``random_split`` with ``is_nell=True``, which overwrites its
    train/val/test masks in place.
    """
    nell = InMemoryNellDataset(root=DATASET_ROOT_FOLDER_FOR_NELL)
    return random_split(nell, is_nell=True)


def get_proposed_dataset():
    """Return a ``ProposedDataset`` rooted at ``DATASET_ROOT_FOLDER``.

    Unlike the other loaders, no ``random_split`` is applied here.
    """
    return ProposedDataset(root=DATASET_ROOT_FOLDER)


def random_split(data, num_train_per_class: int = 20, num_val: int = 500, is_nell=False,
                 generator=None):
    """Overwrite the train/val/test masks of ``data`` with a random split.

    For every class ``c`` in ``range(data.num_classes)`` a random subset of the
    nodes labelled ``c`` is marked as training nodes. The nodes that are not
    training nodes are then shuffled; the first ``num_val`` of them become
    validation nodes and all the others become test nodes.

    Args:
        data: Object exposing ``y``, ``num_classes`` and the boolean tensors
            ``train_mask``, ``val_mask`` and ``test_mask``. The masks are
            modified in place.
        num_train_per_class: Maximum number of training nodes drawn per class
            (classes with fewer nodes contribute all of them).
        num_val: Maximum number of validation nodes.
        is_nell: When true, ``num_train_per_class`` is ignored and 2 training
            nodes per class are drawn instead.
        generator: Optional ``torch.Generator`` used for both shuffles. Pass a
            seeded generator to make the split reproducible; ``None`` (the
            default) uses the global RNG.

    Returns:
        The same ``data`` object, with its three masks rewritten.
    """
    # The NELL override does not depend on the class, so resolve it once
    # instead of rebinding the parameter on every loop iteration.
    train_per_class = 2 if is_nell else num_train_per_class

    data.train_mask.fill_(False)
    for c in range(data.num_classes):
        class_idx = (data.y == c).nonzero(as_tuple=False).view(-1)
        perm = torch.randperm(class_idx.size(0), generator=generator)
        data.train_mask[class_idx[perm[:train_per_class]]] = True

    # Everything that is not a training node is shared between val and test.
    remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
    remaining = remaining[torch.randperm(remaining.size(0), generator=generator)]

    data.val_mask.fill_(False)
    data.val_mask[remaining[:num_val]] = True

    data.test_mask.fill_(False)
    data.test_mask[remaining[num_val:]] = True

    return data




import torch
from torch_geometric.data import InMemoryDataset, Data


class InMemoryNellDataset(InMemoryDataset):
    """Single-graph, in-memory copy of NELL with contiguous class labels.

    ``process`` takes the graph from ``get_nell_dataset()``, remaps its labels
    to ``0 .. K-1`` (``K`` = number of distinct labels) and stores it together
    with all-``False`` train/val/test masks. The masks are placeholders that a
    caller is expected to fill afterwards (``random_split`` does this in
    ``get_in_memeory_nell_dataset``).
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        # NOTE(review): torch.load unpickles the processed file, so ``root``
        # must only point at trusted data. Recent PyTorch releases changed the
        # default of ``weights_only``; confirm this call (or the dataset's own
        # ``self.load`` helper, where available) works with the installed
        # versions.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # NOTE(review): these names look like placeholders; ``download`` is a
        # no-op and ``process`` does not read them -- confirm they are needed.
        return ['file.edges', 'file.x']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # Nothing to fetch here: the graph comes from get_nell_dataset().
        pass

    def process(self):
        """Build the relabelled NELL graph and save it to the processed path."""
        # Index the source dataset once and reuse the resulting graph.
        source = get_nell_dataset()[0]

        # Map the sorted distinct labels onto 0..K-1. ``return_inverse``
        # produces a new tensor, so the source labels are not modified in
        # place and the mapping cannot collide with not-yet-remapped values.
        _, y = torch.unique(source.y, return_inverse=True)

        num_nodes = source.num_nodes
        data = Data(x=source.x, edge_index=source.edge_index, y=y)
        # Placeholder masks (all False).
        data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)

        torch.save(self.collate([data]), self.processed_paths[0])

In [50]:
# Build the in-memory NELL dataset and overwrite its masks with a random split (is_nell=True).
in_memory_nell_dataset = get_in_memeory_nell_dataset()

In [51]:
# Display the single graph held by the dataset (shapes of x, edge_index, y and the masks).
in_memory_nell_dataset[0]

Data(x=[65755, 61278, nnz=426664], edge_index=[2, 251550], y=[65755], train_mask=[65755], test_mask=[65755], val_mask=[65755])

In [52]:
# x is stored sparse; densify feature rows 1..9 for a quick look at the values.
in_memory_nell_dataset[0].x[1:10].to_dense()

tensor([[1., 4., 0.,  ..., 0., 0., 0.],
        [1., 2., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 2., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.]])

In [53]:
from torch_geometric.utils.convert import to_networkx
import networkx as nx

In [56]:
# Convert the NELL graph to a NetworkX graph so it can be exported as an edge list.
G = to_networkx(in_memory_nell_dataset[0])



In [57]:
# Write the NELL graph's edges to "test.edgelist" in the current working directory.
nx.write_edgelist(G, "test.edgelist")

In [24]:
# Experiment: element-wise sum of the sparse feature rows of nodes 1 and 2.
in_memory_nell_dataset[0].x[1] + in_memory_nell_dataset[0].x[2]

SparseTensor(row=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
             col=tensor([   0,    1,    4,    5,    9,   13,   16,   25,   26,   29,   46,   66,
                             67,   69,   71,   79,   85,   90,   92,  107,  133,  172,  187,  277,
                            404,  588,  700,  733,  775,  912,  945, 1005, 1074, 1154, 1226, 1380,
                           1400, 1547, 1607, 1627, 1654, 1662, 1690, 1747, 1749, 1995, 2813, 2916,
                           3156, 3179, 3196, 3342, 3383, 3831, 4276, 4540, 4686, 4843, 4884, 5079]),
             val=tensor([2., 6., 6., 9., 1., 3., 2., 3., 7., 4., 2., 1., 5., 2., 4., 1., 3., 1.,
                           1., 1., 1., 2., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                           1., 1., 1., 1., 1., 3

In [25]:
# Experiment: boolean-mask indexing. This 4-element mask fails with the IndexError
# shown below because the mask length must match the number of rows (65755).
in_memory_nell_dataset[0].x[[True, True, False, True]]

IndexError: The shape of the mask [4] at index 0 does not match the shape of the indexed tensor [65755] at index 0

In [2]:
from torch_geometric.utils.convert import to_networkx
import networkx as nx

In [3]:
# Load CiteSeer with re-drawn train/val/test masks.
citeseer = get_citeseer_dataset()

In [4]:
# Convert the CiteSeer graph to a NetworkX graph (rebinds G, replacing any earlier graph).
G = to_networkx(citeseer[0])

In [6]:
# Write the CiteSeer graph's edges to "citeseer.edgelist" in the current working directory.
nx.write_edgelist(G, "citeseer.edgelist")