In [14]:
import os
from tqdm import tqdm
import time
import pickle
from torch_geometric.data import Data, HeteroData

In [3]:
data_dir = f'data/splits'
toy1 = f'{data_dir}/toy.m.undir.mean.data.pkl'
toy2 = f'{data_dir}/toy.stm.undir.mean.data.pkl'

In [4]:
# load file
def load_file(filepath):
    if os.path.exists(filepath):
        with open(filepath, 'rb') as f:
            return pickle.load(f)
    else:
        print(f'Error! {filepath} not found!')

In [15]:
def get_hm_graph_stats(g):
    print(f'\n------------- Homogeneous Graph Stats --------------\n')

    print(f'Type of graph : {"Undirected" if g.is_undirected else "Directed"}')

    print(f"\nNumber of nodes : {g.num_nodes}")
    print(f"Number of edges : {g.num_edges}")

    print(f'\nEdge List : \n{g.edge_index.t().contiguous()}')
        
    print(f'\n------------------------------\n')

In [16]:
def get_ht_graph_stats(g):
    print(f'\n------------- Heterogeneous Graph Stats --------------\n')
    node_types = g.node_types
    edge_types = g.edge_types
    print(f'Node Types : \n{node_types}\n')
    print(f'Edge Types : ')
    for edge_type in edge_types : 
        print(f'{edge_type}')
    
    print("\nNode Types:\n")
    for node_type in node_types:
        num_nodes = g[node_type].num_nodes
        print(f"Number of nodes in '{node_type}': {num_nodes}")
    
    print("\nEdge Types:\n")
    for edge_type in edge_types:
        num_edges = g[edge_type].num_edges
        print(f"Number of edges in '{edge_type}': {num_edges}")
    print(f'\n------------------------------\n')

In [5]:
homogeneous = load_file(toy1)
heterogeneous = load_file(toy2)

### Show the Difference between Data and HeteroData objects

In [5]:
print(homogeneous)

Data(x=[13, 1], edge_index=[2, 28], edge_attr=[28])


In [29]:
print(heterogeneous)

HeteroData(
  member={ x=[13, 1] },
  team={ x=[31, 1] },
  skill={ x=[10, 1] },
  (skill, to, team)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (member, to, team)={
    edge_index=[2, 68],
    edge_attr=[68],
  },
  (team, rev_to, skill)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (team, rev_to, member)={
    edge_index=[2, 68],
    edge_attr=[68],
  }
)


# Random Node Split

Performs a node-level random split by adding train_mask, val_mask and test_mask attributes to the Data or HeteroData object (functional name: **random_node_split**)
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.RandomNodeSplit.html


### split = 'random'
train, validation, and test sets will be randomly generated

#### split = 'random', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.1
random allows to control the train, test and val separately <br>
num_train_per_class = 1 gives 1 node for each class to the train split <br>
num_val = 0.1 gives (0.1 * total number of nodes) to the val split which are not already in the train split <br>
num_test = 0.1 gives (0.1 * total number of nodes) to the test split (taking from (data - (train + val))) <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>

#### split = 'random', num_splits = 1, num_train_per_class = 2, num_val = 0.3, num_test = 0.5
random allows to control the train, test and val separately <br>
num_train_per_class = 1 gives 1 node for each class to the train split <br>
num_val = 0.3 gives (0.3 * total number of nodes) to the val split which are not already in the train split <br> 
num_test = 0.5 gives (0.5 * total number of nodes) to the test split (taking from (data - (train + val))) <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>


### split = 'train_rest'
all nodes except those in the validation and test sets will be used for training

#### split = 'train_rest', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.1
num_val = 0.1 gives (0.1 * total number of nodes) to the val split <br>
num_test = 0.1 gives (0.1 * total number of nodes) to the test split <br>
**num_train_per_class** = regardless of this number, the entire nodes apart from the val and test split will be given to train split <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>

#### split = 'train_rest', num_splits = 1, num_train_per_class = 5, num_val = 0.8, num_test = 0.8
**Who gets first???** <br>
**val split first, then test, then train** <br>
num_val = 0.8 gives (0.8 * total number of nodes) to the val split which are not already in the train split <br>
num_test = 0.5 gives (0.5 * total number of nodes) from the remaining nodes to the test split <br>
rest of the data to train split (Doesnt get any) <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>

### split = 'test_rest'
all nodes except those in the training and validation sets will be used for test

#### split = 'test_rest', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.1
num_train_per_class = 1 gives 1 node for each class to the train split <br>
num_val = 0.1 gives (0.1 * total number of nodes) to the val split <br>
**num_test = regardless of this number**, the entire nodes apart from the train and val split will be given to test split <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>

#### split = 'test_rest', num_splits = 1, num_train_per_class = 10, num_val = 0.9, num_test = 0.8
**Who gets first???** <br>
**train split first, then val, then test** <br>
num_train_per_class = 1 gives 1 node for each class to the train split <br>
num_val = 0.8 gives (0.8 * total number of nodes) to the val split which are not already in the train split <br>
rest of the data to test split (Doesnt get any) <br>
train, test and val dont overlap <br>
<br> --------------------------------------------------------------------------------------------------------------- <br>

#### Split Homogeneous Data

In [192]:
from torch_geometric.transforms import RandomNodeSplit
from torch_geometric import transforms as T
from copy import deepcopy
import torch

print(f'||||||||||||| Homogeneous Section |||||||||||||\n')
print()
print(f'----------- Before Transform --------------\n')

# add custom y attritbute before node level splitting
hm1 = deepcopy(homogeneous)

print(f'Number of nodes in homogeneous : {hm1.num_nodes}')
hm1.y = torch.randint(0, 2, (hm1.num_nodes, ))
hm1.num_classes = 2
print(hm1)
print()
print(f'Node classes of homogeneous : {hm1.y}')
print(f'Edge attributes of homogeneous : {hm1.edge_attr}')

# transform = RandomNodeSplit(split = 'random', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.1, key = 'y')
# transform = RandomNodeSplit(split = 'random', num_splits = 1, num_train_per_class = 2, num_val = 0.3, num_test = 0.5, key = 'y')
# will leave nothing for train and val
# transform = RandomNodeSplit(split = 'random', num_splits = 1, num_train_per_class = 50, num_val = 0.3, num_test = 0.5, key = 'y')

# transform = RandomNodeSplit(split = 'train_rest', num_splits = 1, num_train_per_class = 1, num_val = 0.8, num_test = 0.8, key = 'y')
# transform = RandomNodeSplit(split = 'train_rest', num_splits = 1, num_train_per_class = 2, num_val = 0.5, num_test = 0.5, key = 'y')
# transform = RandomNodeSplit(split = 'train_rest', num_splits = 1, num_train_per_class = 50, num_val = 0.5, num_test = 0.5, key = 'y')

# test will not get only 1 or 2 nodes for 0.1 ratio, rather the entire set of remaining nodes 
# transform = RandomNodeSplit(split = 'test_rest', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.1, key = 'y')
# transform = RandomNodeSplit(split = 'test_rest', num_splits = 1, num_train_per_class = 2, num_val = 0.3, num_test = 0.5, key = 'y')
# will leave nothing for val and test
# transform = RandomNodeSplit(split = 'test_rest', num_splits = 1, num_train_per_class = 10, num_val = 0.9, num_test = 0.8, key = 'y')

hm1 = transform(hm1)
print()
print(f'----------- After Transform --------------\n')
print(hm1)

print(f'\nAll the split nodes :')
print(f'train nodes : {hm1.train_mask.nonzero().view(-1).tolist()}')
print(f'train node classes : {hm1.y[hm1.train_mask]}')
print(f'val nodes : {hm1.val_mask.nonzero().view(-1).tolist()}')
print(f'test nodes : {hm1.test_mask.nonzero().view(-1).tolist()}')
print()
print(f'Number of train nodes : {hm1.train_mask.sum().item()}')
print(f'Number of validation nodes : {hm1.val_mask.sum().item()}')
print(f'Number of test nodes : {hm1.test_mask.sum().item()}')
print()
print(f'---------------------------------------------\n')

||||||||||||| Homogeneous Section |||||||||||||


----------- Before Transform --------------

Number of nodes in homogeneous : 13
Data(x=[13, 1], edge_index=[2, 28], edge_attr=[28], y=[13], num_classes=2)

Node classes of homogeneous : tensor([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0])
Edge attributes of homogeneous : tensor([3, 2, 2, 3, 1, 2, 1, 2, 3, 2, 5, 5, 5, 4, 5, 4, 1, 3, 6, 2, 6, 4, 1, 4,
        5, 1, 5, 1])

----------- After Transform --------------

Data(x=[13, 1], edge_index=[2, 28], edge_attr=[28], y=[13], num_classes=2, train_mask=[13], val_mask=[13], test_mask=[13])

All the split nodes :
train nodes : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
train node classes : tensor([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0])
val nodes : []
test nodes : []

Number of train nodes : 13
Number of validation nodes : 0
Number of test nodes : 0

---------------------------------------------



#### Split Heterogeneous Data

##### 1. Collect the nodetypes and edgetypes first

In [None]:
ht1 = deepcopy(heterogeneous)
node_types = ht1.node_types
edge_types = ht1.edge_types

get_ht_graph_stats(ht1)

##### 2. Add y attribute based on the nodetypes (Consider 3 classes 0, 1 and 2)

In [146]:
print(f'||||||||||||| Heterogeneous Section |||||||||||||\n')
print()
print(f'----------- Before Transform --------------\n')
# add custom y attritbute before node level splitting

for node_type in node_types:
    ht1[node_type].y = torch.randint(0, 3, (ht1[node_type].num_nodes, ))
ht1.num_classes = 3

print(ht1)
print()
for node_type in node_types:
    print(f'Node classes of node type {node_type} : {ht1[node_type].y}')

transform = RandomNodeSplit(split = 'train_rest', num_splits = 1, num_train_per_class = 1, num_val = 0.1, num_test = 0.2, key = 'y')
ht1 = transform(ht1)

print()
print(f'----------- After Transform --------------\n')
print(ht1)

for node_type in node_types:
    print(f'\n *** Node Type {node_type} ***\n')
    print(f'train mask : {ht1[node_type].train_mask.nonzero().view(-1).tolist()}')
    print(f'val mask : {ht1[node_type].val_mask.nonzero().view(-1).tolist()}')
    print(f'test mask : {ht1[node_type].test_mask.nonzero().view(-1).tolist()}')
    print()
    print(f'Number of train nodes : {ht1[node_type].train_mask.sum().item()}')
    print(f'Number of validation nodes : {ht1[node_type].val_mask.sum().item()}')
    print(f'Number of test nodes : {ht1[node_type].test_mask.sum().item()}')
    print()
print(f'---------------------------------------------\n')

||||||||||||| Heterogeneous Section |||||||||||||


----------- Before Transform --------------

HeteroData(
  num_classes=3,
  member={
    x=[13, 1],
    y=[13],
    train_mask=[13],
    val_mask=[13],
    test_mask=[13],
  },
  team={
    x=[31, 1],
    y=[31],
    train_mask=[31],
    val_mask=[31],
    test_mask=[31],
  },
  skill={
    x=[10, 1],
    y=[10],
    train_mask=[10],
    val_mask=[10],
    test_mask=[10],
  },
  (skill, to, team)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (member, to, team)={
    edge_index=[2, 68],
    edge_attr=[68],
  },
  (team, rev_to, skill)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (team, rev_to, member)={
    edge_index=[2, 68],
    edge_attr=[68],
  }
)

Node classes of node type member : tensor([0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2])
Node classes of node type team : tensor([0, 2, 2, 0, 1, 2, 1, 2, 1, 1, 1, 2, 0, 2, 2, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 2, 2, 1, 2])
Node classes of node type skill : tensor([2, 2

### Run a Classification Model (GCN) for Homogeneous Data

In [147]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(hm1.num_node_features, 16)
        self.conv2 = GCNConv(16, hm1.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [158]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
hm1 = hm1.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

model.train()
for epoch in tqdm(range(100)):
    optimizer.zero_grad()
    out = model(hm1)
    loss = F.nll_loss(out[hm1.train_mask], hm1.y[hm1.train_mask])
    loss.backward()
    optimizer.step()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 171.99it/s]


In [159]:
model.eval()
pred = model(hm1).argmax(dim=1)
print(f'pred = {pred}')
correct = (pred[hm1.test_mask] == hm1.y[hm1.test_mask]).sum()
acc = int(correct) / int(hm1.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

pred = tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Accuracy: 0.6667


# --------------------------------------------------

# Random Link Split

Performs an edge-level random split into training, validation and test sets of a Data or a HeteroData object (functional name: random_link_split). The split is performed such that the training split does not include edges in validation and test splits; and the validation split does not include edges in the test split. <br>

https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.RandomLinkSplit.html


In [9]:
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric import transforms as T
from copy import deepcopy
import torch

In [17]:
print(homogeneous)

Data(x=[13, 1], edge_index=[2, 28], edge_attr=[28])


In [18]:
print(heterogeneous)

HeteroData(
  member={ x=[13, 1] },
  team={ x=[31, 1] },
  skill={ x=[10, 1] },
  (skill, to, team)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (member, to, team)={
    edge_index=[2, 68],
    edge_attr=[68],
  },
  (team, rev_to, skill)={
    edge_index=[2, 72],
    edge_attr=[72],
  },
  (team, rev_to, member)={
    edge_index=[2, 68],
    edge_attr=[68],
  }
)


In [19]:
hm = deepcopy(homogeneous)
ht = deepcopy(heterogeneous)

In [20]:
transform = RandomLinkSplit(is_undirected = True)
train, val, test = transform(hm)

In [21]:
# homogeneous stats
get_hm_graph_stats(train)


------------- Homogeneous Graph Stats --------------

Type of graph : Undirected

Number of nodes : 13
Number of edges : 22

Edge List : 
tensor([[ 0,  1],
        [ 3,  7],
        [ 7,  8],
        [ 4,  5],
        [ 9, 10],
        [ 9, 11],
        [ 0,  3],
        [ 6, 12],
        [ 0,  2],
        [ 3,  8],
        [ 1,  2],
        [ 1,  0],
        [ 7,  3],
        [ 8,  7],
        [ 5,  4],
        [10,  9],
        [11,  9],
        [ 3,  0],
        [12,  6],
        [ 2,  0],
        [ 8,  3],
        [ 2,  1]])

------------------------------



In [23]:
get_hm_graph_stats(val)


------------- Homogeneous Graph Stats --------------

Type of graph : Undirected

Number of nodes : 13
Number of edges : 22

Edge List : 
tensor([[ 0,  1],
        [ 3,  7],
        [ 7,  8],
        [ 4,  5],
        [ 9, 10],
        [ 9, 11],
        [ 0,  3],
        [ 6, 12],
        [ 0,  2],
        [ 3,  8],
        [ 1,  2],
        [ 1,  0],
        [ 7,  3],
        [ 8,  7],
        [ 5,  4],
        [10,  9],
        [11,  9],
        [ 3,  0],
        [12,  6],
        [ 2,  0],
        [ 8,  3],
        [ 2,  1]])

------------------------------



In [24]:
get_hm_graph_stats(test)


------------- Homogeneous Graph Stats --------------

Type of graph : Undirected

Number of nodes : 13
Number of edges : 24

Edge List : 
tensor([[ 0,  1],
        [ 3,  7],
        [ 7,  8],
        [ 4,  5],
        [ 9, 10],
        [ 9, 11],
        [ 0,  3],
        [ 6, 12],
        [ 0,  2],
        [ 3,  8],
        [ 1,  2],
        [ 4,  6],
        [ 1,  0],
        [ 7,  3],
        [ 8,  7],
        [ 5,  4],
        [10,  9],
        [11,  9],
        [ 3,  0],
        [12,  6],
        [ 2,  0],
        [ 8,  3],
        [ 2,  1],
        [ 6,  4]])

------------------------------

