In [1]:
from torch_geometric.datasets import Planetoid, CitationFull, NELL
from Proposed.proposed_dataset import ProposedDataset
import torch_geometric.transforms as T
from torch_geometric.data import InMemoryDataset, Data
import torch


# Directory passed as `root` to every dataset constructor in this cell.
DATASET_ROOT_FOLDER = "Datasets"

def get_citeseer_dataset():
    """Load the Planetoid "CiteSeer" dataset and re-split it.

    The dataset is built under ``DATASET_ROOT_FOLDER`` with ``split='random'``
    and then handed to ``random_split`` (default arguments), which rewrites
    the train/val/test masks in place.

    Returns:
        Whatever ``random_split`` returns, i.e. the same dataset object.
    """
    citeseer = Planetoid(root=DATASET_ROOT_FOLDER, name="CiteSeer", split="random")
    return random_split(citeseer)

def get_cora_dataset():
    """Load the Planetoid "Cora" dataset and re-split it.

    The dataset is built under ``DATASET_ROOT_FOLDER`` with ``split='random'``
    and then handed to ``random_split`` (default arguments), which rewrites
    the train/val/test masks in place.

    Returns:
        Whatever ``random_split`` returns, i.e. the same dataset object.
    """
    cora = Planetoid(root=DATASET_ROOT_FOLDER, name="Cora", split="random")
    return random_split(cora)


def get_pubmed_dataset():
    """Load the Planetoid "PubMed" dataset and re-split it.

    The dataset is built under ``DATASET_ROOT_FOLDER`` with ``split='random'``
    and then handed to ``random_split`` (default arguments), which rewrites
    the train/val/test masks in place.

    Returns:
        Whatever ``random_split`` returns, i.e. the same dataset object.
    """
    pubmed = Planetoid(root=DATASET_ROOT_FOLDER, name="PubMed", split="random")
    return random_split(pubmed)


def get_nell_dataset():
    """Load the NELL dataset, attach a dense copy of ``x``, and re-split it.

    Steps, in order:
      1. Build ``NELL`` under ``DATASET_ROOT_FOLDER`` with a
         ``T.RandomNodeSplit(split='random')`` transform.
      2. Densify ``x`` of the first graph and store it as ``.x`` on the
         dataset object.
      3. Pass the dataset to ``random_split`` with default arguments.

    NOTE(review): the dense tensor is assigned to the dataset object, not to
    the ``Data`` returned by indexing; the notebook output for
    ``dataset[0].x`` further down still shows a ``SparseTensor`` -- confirm
    this is what downstream code expects.
    NOTE(review): ``random_split`` has an ``is_nell`` flag that is not passed
    here -- confirm whether that is intentional.

    Returns:
        Whatever ``random_split`` returns, i.e. the same dataset object.
    """
    nell = NELL(
        root=DATASET_ROOT_FOLDER,
        transform=T.RandomNodeSplit(split="random"),
    )

    first_graph = nell[0]
    nell.x = first_graph.x.to_dense()

    return random_split(nell)


def get_proposed_dataset():
    """Build and return a ``ProposedDataset`` rooted at ``DATASET_ROOT_FOLDER``.

    Unlike the other loaders in this cell, the result is returned as-is and
    is not passed through ``random_split``.
    """
    return ProposedDataset(root=DATASET_ROOT_FOLDER)


def random_split(data, num_train_per_class: int = 20, num_val: int = 500, is_nell: bool = False):
    """Overwrite ``data``'s train/val/test masks in place with a random split.

    For every class ``c`` in ``range(data.num_classes)``, up to
    ``num_train_per_class`` nodes with ``data.y == c`` are picked at random and
    flagged in ``train_mask``. The nodes left over are shuffled; the first
    ``num_val`` of them are flagged in ``val_mask`` and all the others in
    ``test_mask``.

    Args:
        data: Object exposing ``y``, ``num_classes`` and boolean
            ``train_mask`` / ``val_mask`` / ``test_mask`` tensors that can be
            mutated in place.
        num_train_per_class: Maximum number of training nodes per class.
        num_val: Number of validation nodes drawn from the non-training nodes.
        is_nell: When true, the per-class budget becomes
            ``int(num_train_per_class * 0.1)``.

    Returns:
        The same ``data`` object, with its three masks rewritten.
    """
    # Resolve the per-class budget exactly once. Doing this inside the class
    # loop re-applied the 10% scaling on every iteration (20 -> 2 -> 0 -> ...),
    # leaving every class after the first without training nodes.
    if is_nell:
        num_train_per_class = int(num_train_per_class * .1)

    data.train_mask.fill_(False)
    for c in range(data.num_classes):
        idx = (data.y == c).nonzero(as_tuple=False).view(-1)
        # Random subset of this class; slicing caps it at the class size.
        idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
        data.train_mask[idx] = True

    # Everything that is not a training node, in random order.
    remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
    remaining = remaining[torch.randperm(remaining.size(0))]

    data.val_mask.fill_(False)
    data.val_mask[remaining[:num_val]] = True

    data.test_mask.fill_(False)
    data.test_mask[remaining[num_val:]] = True

    return data

In [2]:
# Build the NELL dataset through the loader defined in the first cell.
dataset = get_nell_dataset()



In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Show the dataset object's repr.
dataset

NELL()

In [4]:
# Show the first (index 0) item of the dataset.
dataset[0]

Data(x=[65755, 61278, nnz=426664], edge_index=[2, 251550], y=[65755], train_mask=[65755], val_mask=[65755], test_mask=[65755])

In [6]:
# Inspect the `x` attribute of the first item.
dataset[0].x



SparseTensor(row=tensor([    0,     0,     0,  ..., 65752, 65753, 65754]),
             col=tensor([    0,     1,     2,  ..., 61275, 61276, 61277]),
             val=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
             size=(65755, 61278), nnz=426664, density=0.01%)

In [7]:
from torch_sparse import SparseTensor


In [16]:
# Load PubMed through the loader defined in the first cell.
# NOTE(review): `pd` is the conventional alias for pandas; a more descriptive
# name would avoid a clash if pandas is ever imported in this notebook.
pd = get_pubmed_dataset()

In [18]:
# Show the first (index 0) item of the PubMed dataset.
pd[0]

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])