# MC-SVD Procedure - Triad Prediction

In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn import init
from random import shuffle, randint
import torch.nn.functional as F
from torch_geometric.datasets import Reddit, PPI, Planetoid
from itertools import combinations, combinations_with_replacement
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
import pickle

## Define the dataset, the type of prediction and the number of samples

In [2]:
# Name of the graph dataset; used as the key into the `datasets` dict and as
# the prefix of the triad pickle filenames.
DATASET = 'cora'
# Prediction task; used as the key into `predictions` (output size) and to
# pick the node-set size passed to the model.
PREDICTION = 'triad'
# NOTE(review): RUN_COUNT is not referenced anywhere else in the visible notebook.
RUN_COUNT = 1
# Number of randomly seeded truncated-SVD embeddings drawn per forward pass.
NUM_SAMPLES = 1
# Prefix prepended to '/datasets/...' when building each dataset's root path.
PATH_TO_DATASETS_DIRECTORY = './'

In [3]:
# Zero-argument loaders, one per supported dataset. Building the dataset object
# is deferred so that only the dataset selected by DATASET is instantiated
# (per the PyTorch Geometric docs, constructing a dataset downloads and
# processes it when it is not already present under `root`, which is costly
# for Reddit).
dataset_loaders = {
    'reddit': lambda: Reddit(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/Reddit'),
    'cora': lambda: Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/Cora/', name='Cora'),
    'citeseer': lambda: Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/CiteSeer/', name='CiteSeer'),
    'pubmed': lambda: Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/PubMed/', name='PubMed'),
}
dataset = dataset_loaders[DATASET]()
# Keep the `datasets` name so that a lookup by the selected name still works.
datasets = {DATASET: dataset}
# The first (and, for these datasets, the only indexed) graph object.
data = dataset[0]
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Size of the model's output layer for each supported prediction task.
predictions = {
    'node' : dataset.num_classes,  # one output per node class of the loaded dataset
    'link' : 2,  # presumably edge / no edge — TODO confirm
    'triad' : 4,  # matches the four triad targets 0, 1, 2, 3 built in sample_triads
}

In [5]:
dataset_types = ['train', 'validation', 'test']
triads_stores = dict()
# NOTE(review): absolute, machine-specific location of the precomputed triad
# pickles; it must exist on the machine running this notebook.
triad_loc = '/scratch-ml00/bsriniv/pos2struct/triad_store/'

# Key used in `triads_stores[...]` -> word used in the pickle filename.
triad_file_tags = {'zeros': 'zero', 'ones': 'one', 'twos': 'two', 'threes': 'three'}

for set_nature in dataset_types:
    loaded_triads = dict()
    for store_key, file_tag in triad_file_tags.items():
        triad_filename = triad_loc + set_nature + '/' + DATASET + '_triad_' + file_tag + '.pickle'
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load triad stores that come from a trusted source.
        with open(triad_filename, 'rb') as f:
            loaded_triads[store_key] = pickle.load(f)
    # Publish the split only once all four of its files were read successfully.
    triads_stores[set_nature] = loaded_triads

In [6]:
# (label, value) pairs describing the loaded dataset and the run configuration.
# The mask counts reflect the dataset's masks as they are when this cell runs.
dataset_summary = [
    ("Name: ", DATASET),
    ("Total Number of Nodes: ", data.num_nodes),
    ("Total Number of Training Nodes: ", data.train_mask.sum().item()),
    ("Total Number of Val Nodes: ", data.val_mask.sum().item()),
    ("Total Number of Test Nodes: ", data.test_mask.sum().item()),
    ("Num Node Features: ", data.num_features),
    ("Num Node Classes: ", dataset.num_classes),
    ("Number of Edges: ", data.edge_index.shape[1]),
    ("Number of Samples for structural: ", NUM_SAMPLES),
    ("Prediction Type: ", PREDICTION),
]

print("Printing Dataset Characteristics")
for summary_label, summary_value in dataset_summary:
    print(summary_label, summary_value)

Printing Dataset Characteristics
Name:  cora
Total Number of Nodes:  2708
Total Number of Training Nodes:  140
Total Number of Val Nodes:  500
Total Number of Test Nodes:  1000
Num Node Features:  1433
Num Node Classes:  7
Number of Edges:  10556
Number of Samples for structural:  1
Prediction Type:  triad


In [7]:
# Replace the dataset's train mask: every node that is in neither the
# validation split nor the test split becomes a training node.
data.train_mask = ~(data.val_mask | data.test_mask)

# Dense adjacency matrix over all nodes, with a 1 for every (source, target)
# column of edge_index.
adj_mat = torch.zeros((data.num_nodes, data.num_nodes))
edges = data.edge_index.t()
edge_sources, edge_targets = data.edge_index
adj_mat[edge_sources, edge_targets] = 1

## Build the non-overlapping induced subgraphs

In [8]:
def _induced_adjacency(node_mask):
    """Return the block of `adj_mat` restricted to the nodes selected by `node_mask`.

    Rows are selected first, then (through a transpose) columns, so only
    edges whose two endpoints are both in the mask are kept.
    """
    selected_rows = adj_mat[node_mask]
    return selected_rows.t()[node_mask].t()


# One adjacency matrix per split; edges that cross splits are dropped.
adj_train = _induced_adjacency(data.train_mask)
adj_validation = _induced_adjacency(data.val_mask)
adj_test = _induced_adjacency(data.test_mask)



## Corrupt a small fraction of the edges

In [9]:
def add_edges(adj_mat, triplet):
    """Add one undirected edge among the three nodes of `triplet`, in place.

    The pairs are checked in the order (0, 1), then (1, 2): the first of these
    whose entry is not 1 receives the new edge; when both are already 1 the
    edge is written to the pair (0, 2).

    Args:
        adj_mat: square adjacency matrix; modified in place.
        triplet: indexable holding three node indices.

    Returns:
        The same `adj_mat` object, after the update.
    """
    first, second, third = triplet[0], triplet[1], triplet[2]

    if adj_mat[first, second] != 1:
        node_u, node_v = first, second
    elif adj_mat[second, third] != 1:
        node_u, node_v = second, third
    else:
        node_u, node_v = first, third

    # Write both directions so the matrix stays symmetric for this pair.
    adj_mat[node_u, node_v] = 1
    adj_mat[node_v, node_u] = 1
    return adj_mat

def delete_edges(adj_mat, triplet):
    """Remove one undirected edge among the three nodes of `triplet`, in place.

    The pairs are checked in the order (0, 1), then (1, 2): the first of these
    whose entry is not 0 is cleared; when both are already 0 the pair (0, 2)
    is cleared instead.

    Args:
        adj_mat: square adjacency matrix; modified in place.
        triplet: indexable holding three node indices.

    Returns:
        The same `adj_mat` object, after the update.
    """
    first, second, third = triplet[0], triplet[1], triplet[2]

    if adj_mat[first, second] != 0:
        node_u, node_v = first, second
    elif adj_mat[second, third] != 0:
        node_u, node_v = second, third
    else:
        node_u, node_v = first, third

    # Clear both directions so the matrix stays symmetric for this pair.
    adj_mat[node_u, node_v] = 0
    adj_mat[node_v, node_u] = 0
    return adj_mat


def _corrupt_triad_class(adj_mat_corrupted, stored_triads, picks, edge_count, target_counts):
    """Move the picked triads of one class to a randomly chosen other edge count.

    Args:
        adj_mat_corrupted: adjacency matrix being corrupted; edited in place
            by `add_edges` / `delete_edges`.
        stored_triads: tensor of node triplets for this class (viewed as (-1, 3)).
        picks: list of row indices into `stored_triads` to corrupt.
        edge_count: number of edges the code assumes each triad of this class has.
        target_counts: list of the other edge counts a triad may be moved to;
            shuffled in place for every triad, and its first element is the
            chosen target.

    Returns:
        (adj_mat_corrupted, originals) where `originals` lists the picked
        triads as long tensors, in sampling order.
    """
    originals = []
    for t in stored_triads.view(-1, 3)[picks]:
        t = t.type(torch.long)
        originals.append(t)
        shuffle(target_counts)
        num_to_change = edge_count - target_counts[0]
        # Positive difference: the triad has too many edges, so remove some;
        # negative difference: add the missing ones.
        edit = delete_edges if num_to_change > 0 else add_edges
        for _ in range(abs(num_to_change)):
            adj_mat_corrupted = edit(adj_mat_corrupted, t)
    return adj_mat_corrupted, originals


def corrupt_adj_triads(adj_mat, set_nature, percent=1):
    """Return a corrupted copy of `adj_mat` plus the triads that were corrupted.

    For each of the four triad classes stored in `triads_stores[set_nature]`
    ('threes', 'twos', 'ones', 'zeros', processed in that order), triads are
    sampled with replacement and edges are added or removed so that each
    sampled triad ends up with a different, randomly chosen number of edges.

    Args:
        adj_mat: adjacency matrix of the split; it is cloned, not modified.
        set_nature: key into `triads_stores` ('train', 'validation' or 'test').
        percent: percentage of the smallest class size to corrupt; the number
            of sampled triads per class is int(min_size * percent / 100) + 1.

    Returns:
        (adj_mat_corrupted, true_zeros, true_ones, true_twos, true_threes):
        the corrupted matrix followed by, per original class, the list of
        sampled triads (long tensors). The class each triad was moved to is
        not returned.
    """
    store = triads_stores[set_nature]
    zeros_shape = store['zeros'].shape[0]
    ones_shape = store['ones'].shape[0]
    twos_shape = store['twos'].shape[0]
    threes_shape = store['threes'].shape[0]
    min_value = min(zeros_shape, ones_shape, twos_shape, threes_shape)
    num_to_corrupt = int(min_value * percent / 100.0) + 1
    adj_mat_corrupted = adj_mat.clone()

    # Threes corruption (the sample size is additionally capped by the class size).
    picks = list(np.random.randint(threes_shape, size=min(num_to_corrupt, threes_shape)))
    adj_mat_corrupted, true_threes = _corrupt_triad_class(
        adj_mat_corrupted, store['threes'], picks, 3, [0, 1, 2])

    # Twos corruption
    picks = list(np.random.randint(twos_shape, size=num_to_corrupt))
    adj_mat_corrupted, true_twos = _corrupt_triad_class(
        adj_mat_corrupted, store['twos'], picks, 2, [0, 1, 3])

    # Ones corruption
    picks = list(np.random.randint(ones_shape, size=num_to_corrupt))
    adj_mat_corrupted, true_ones = _corrupt_triad_class(
        adj_mat_corrupted, store['ones'], picks, 1, [0, 2, 3])

    # Zeros corruption
    picks = list(np.random.randint(zeros_shape, size=num_to_corrupt))
    adj_mat_corrupted, true_zeros = _corrupt_triad_class(
        adj_mat_corrupted, store['zeros'], picks, 0, [1, 2, 3])

    return adj_mat_corrupted, true_zeros, true_ones, true_twos, true_threes

In [10]:
# Corrupt each split's induced subgraph independently. Besides the corrupted
# adjacency matrix, each call returns the sampled triads grouped by the class
# they were drawn from.
(adj_train_corrupted,
 train_true_zeros, train_true_ones,
 train_true_twos, train_true_threes) = corrupt_adj_triads(adj_train, 'train', percent=1)
(adj_val_corrupted,
 val_true_zeros, val_true_ones,
 val_true_twos, val_true_threes) = corrupt_adj_triads(adj_validation, 'validation', percent=1)
# The test result is unpacked into test_* names only, so the training list
# `train_true_ones` is no longer overwritten by the test triads.
(adj_test_corrupted,
 test_true_zeros, test_true_ones,
 test_true_twos, test_true_threes) = corrupt_adj_triads(adj_test, 'test', percent=1)

## Define the Supervised Learning Network

In [11]:
# Width of every hidden layer.
num_neurons = 256
# Per-node input size: a structural embedding with `num_neurons` columns
# concatenated with the raw node features.
input_rep = num_neurons + data.num_features

class StructMLP(nn.Module):
    """Set-based classifier over node subsets, averaged over embedding samples.

    Each node representation goes through a two-layer MLP, the representations
    of the nodes in every sampled subset are summed, passed through one linear
    layer, averaged over the embedding samples, and classified by a
    one-hidden-layer predictor with `predictions[PREDICTION]` outputs.
    """

    def __init__(self, node_set_size=1):
        """Create the layers.

        Args:
            node_set_size: number of nodes per predicted subset; stored on the
                instance but not read by `forward`.
        """
        super().__init__()

        self.node_set_size = node_set_size

        # Deepsets MLP
        self.ds_layer_1 = nn.Linear(input_rep, num_neurons)
        self.ds_layer_2 = nn.Linear(num_neurons, num_neurons)
        self.rho_layer_1 = nn.Linear(num_neurons, num_neurons)
        # NOTE(review): rho_layer_2 and sigmoid are never used in forward();
        # they are kept so that the parameter set (and therefore saved
        # checkpoints and the initialisation order) stays unchanged.
        self.rho_layer_2 = nn.Linear(num_neurons, num_neurons)

        # One Hidden Layer
        self.layer1 = nn.Linear(num_neurons, num_neurons)
        self.layer2 = nn.Linear(num_neurons, predictions[PREDICTION])
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_tensor, samples):
        """Return unnormalised class scores for every row of `samples`.

        Args:
            input_tensor: stacked node representations, indexed first by
                embedding sample; each slice is viewed as (-1, input_rep).
            samples: integer index tensor selecting nodes; in this notebook
                it has one row of three node indices per triad.

        Returns:
            Tensor with `samples.shape[0]` rows and `predictions[PREDICTION]`
            columns (logits; no softmax is applied).
        """
        num_draws = input_tensor.shape[0]
        # Allocate the accumulator directly on the device of the inputs
        # instead of creating it on the CPU and copying it over.
        sum_tensor = torch.zeros(samples.shape[0], num_neurons, device=input_tensor.device)
        for i in range(num_draws):
            # Per-node encoder applied to every node of this embedding sample
            set_init_rep = input_tensor[i].view(-1, input_rep)
            x = self.ds_layer_1(set_init_rep)
            x = self.relu(x)
            x = self.ds_layer_2(x)
            # Gather the nodes of each subset and sum them (order-invariant pooling)
            x = x[samples]
            x = torch.sum(x, dim=1)
            x = self.rho_layer_1(x)
            sum_tensor = sum_tensor + x

        # Average over the embedding samples
        x = sum_tensor / num_draws

        # One Hidden Layer for predictor
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

    def compute_loss(self, input_tensor, samples, target):
        """Return the cross-entropy between `forward(...)` scores and `target` class indices."""
        pred = self.forward(input_tensor, samples)
        return F.cross_entropy(pred, target)

In [12]:
# Number of nodes in each predicted subset: 1 for node prediction, 2 for link
# prediction, and 3 for every other task (here: triads).
node_set_size = {'node': 1, 'link': 2}.get(PREDICTION, 3)

mlp = StructMLP(node_set_size).to(device)
mlp_optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)
# File the best (lowest validation loss) weights are written to and read from.
mlp_model = 'best_mlp_model.model'

## Training the Supervised Learning Network

In [13]:
def sample_triads(set_nature, small_samples=100):
    """Sample triads from each of the four stored classes of one split.

    For every class ('zeros', 'ones', 'twos', 'threes', in that order) up to
    `small_samples` rows are drawn with replacement from
    `triads_stores[set_nature]`; a class contributes at most as many draws as
    it has rows.

    Args:
        set_nature: key into `triads_stores` ('train', 'validation' or 'test').
        small_samples: maximum number of draws per class.

    Returns:
        (out, target), both moved to `device`: `out` is a long tensor of shape
        (-1, 3) with the sampled node triplets, and `target` is a long tensor
        with the class index (0, 1, 2 or 3) of each row.
    """
    store = triads_stores[set_nature]
    drawn_triads = []
    drawn_labels = []
    for class_index, class_key in enumerate(('zeros', 'ones', 'twos', 'threes')):
        pool = store[class_key]
        pool_size = pool.shape[0]
        chosen_rows = np.random.randint(pool_size, size=min(small_samples, pool_size))
        chosen = pool[chosen_rows]
        drawn_triads.append(chosen)
        # Constant label vector for this class (cast to long after concatenation)
        drawn_labels.append(float(class_index) * torch.ones(chosen.shape[0]))

    out = torch.cat(drawn_triads, dim=0).view(-1, 3).type(torch.long)
    target = torch.cat(drawn_labels, dim=0).type(torch.long)
    return out.to(device), target.to(device)

In [14]:
epochs = 50
# Best validation loss seen so far (plain float), initialised to a large value.
validation_loss = 10000.0
small_samples = 200


def _mc_svd_inputs(adj_corrupted, node_features):
    """Build the stacked model input for one split.

    Draws NUM_SAMPLES random seeds, computes one truncated SVD embedding of
    `adj_corrupted` per seed, and concatenates each embedding column-wise
    with `node_features`.

    Args:
        adj_corrupted: corrupted adjacency matrix (torch tensor) of the split.
        node_features: raw feature rows of the same nodes.

    Returns:
        Detached tensor with one slice per seed, on `device`.
    """
    seeds = list(np.random.randint(500, size=NUM_SAMPLES))
    # scikit-learn works on NumPy data, so hand it an explicit array.
    adj_array = adj_corrupted.cpu().numpy()
    node_features = node_features.to(device)
    draws = []
    for seed in seeds:
        # The embedding width must equal num_neurons because the model's
        # input size is num_neurons + number of raw features.
        svd = TruncatedSVD(n_components=num_neurons, n_iter=10, random_state=seed)
        embedding = torch.Tensor(svd.fit_transform(adj_array)).to(device)
        draws.append(torch.cat((embedding, node_features), 1))
    return torch.stack(draws).detach()


for num_epoch in range(epochs):
    mlp_optimizer.zero_grad()
    input_ = _mc_svd_inputs(adj_train_corrupted, data.x[data.train_mask])
    sampled, target = sample_triads('train', small_samples=small_samples)
    loss = mlp.compute_loss(input_, sampled, target=target)
    print("Training Loss: ", loss.item())

    with torch.no_grad():
        # Validate the current (pre-step) weights and keep them if they are the best so far
        input_val = _mc_svd_inputs(adj_val_corrupted, data.x[data.val_mask])
        val_sampled, val_target = sample_triads('validation', small_samples=small_samples)
        # .item() keeps the tracked best loss a Python float rather than a tensor
        compute_val_loss = mlp.compute_loss(input_val, val_sampled, target=val_target).item()
        if compute_val_loss < validation_loss:
            validation_loss = compute_val_loss
            print("Validation Loss: ", validation_loss)
            # Save Model
            torch.save(mlp.state_dict(), mlp_model)

    loss.backward()
    mlp_optimizer.step()



Training Loss:  1.388076901435852




Validation Loss:  tensor(1.4070, device='cuda:0')




Training Loss:  1.3758803606033325




Validation Loss:  tensor(1.3801, device='cuda:0')




Training Loss:  1.3588989973068237




Validation Loss:  tensor(1.3653, device='cuda:0')




Training Loss:  1.3375961780548096




Validation Loss:  tensor(1.3594, device='cuda:0')




Training Loss:  1.2995332479476929




Validation Loss:  tensor(1.3568, device='cuda:0')




Training Loss:  1.2551569938659668




Training Loss:  1.219376802444458




Training Loss:  1.172326922416687




Training Loss:  1.1540580987930298




Validation Loss:  tensor(1.3551, device='cuda:0')




Training Loss:  1.0636274814605713




Training Loss:  1.0654171705245972




Training Loss:  1.02242112159729




Validation Loss:  tensor(1.3258, device='cuda:0')




Training Loss:  0.9478728771209717




Validation Loss:  tensor(1.2545, device='cuda:0')




Training Loss:  0.9571122527122498




Training Loss:  0.8732000589370728




Training Loss:  0.9524827599525452




Training Loss:  0.852349042892456




Training Loss:  0.8929553627967834




Training Loss:  0.8130388855934143




Training Loss:  0.859551191329956




Training Loss:  0.8242541551589966




Training Loss:  0.8387501239776611




Training Loss:  0.8152374625205994




Training Loss:  0.7623845338821411




Training Loss:  0.76494300365448




Training Loss:  0.7861733436584473




Training Loss:  0.7909943461418152




Training Loss:  0.8055086135864258




Training Loss:  0.7689651250839233




Training Loss:  0.7286304831504822




Training Loss:  0.7360619306564331




Training Loss:  0.7838864922523499




Training Loss:  0.7702267169952393




Training Loss:  0.7846922278404236




Training Loss:  0.7462897300720215




Training Loss:  0.7779768109321594




Training Loss:  0.7126182913780212




Training Loss:  0.7177548408508301




Training Loss:  0.7467566728591919




Training Loss:  0.7496386170387268




Training Loss:  0.7568770051002502




Training Loss:  0.7021830677986145




Training Loss:  0.7253904342651367




Training Loss:  0.7221789956092834




Training Loss:  0.7048722505569458




Training Loss:  0.6918832063674927




Training Loss:  0.7135232090950012




Training Loss:  0.7230628728866577




Training Loss:  0.7128265500068665




Training Loss:  0.7389586567878723




## Load the best model

In [15]:
# Rebuild the network and restore the weights saved at the lowest validation loss.
mlp = StructMLP(node_set_size).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
mlp.load_state_dict(torch.load(mlp_model, map_location=device))

<All keys matched successfully>

## Forward pass on the test graphs

In [16]:
# One random seed per truncated-SVD embedding of the corrupted test graph
numbers = list(np.random.randint(500, size=NUM_SAMPLES))
test_features = data.x[data.test_mask].to(device)
hidden_samples_test = []
for number in numbers:
    svd = TruncatedSVD(n_components=256, n_iter=10, random_state=number)
    u_test = svd.fit_transform(adj_test_corrupted)
    # Structural embedding followed, column-wise, by the raw node features
    hidden_samples_test.append(torch.cat((torch.Tensor(u_test).to(device), test_features), 1))

# Sample the test triads and keep their labels as a NumPy array for the metrics
small_samples = 200
sampled_test, target_test = sample_triads('test', small_samples)
t_test = target_test.to("cpu").numpy()

input_test = torch.stack(hidden_samples_test).detach()

# Score the triads without tracking gradients, then pick the most likely class
with torch.no_grad():
    test_pred = mlp.forward(input_test, sampled_test)
    test_log_probs = F.log_softmax(test_pred, dim=1)
pred = np.argmax(test_log_probs.detach().to("cpu").numpy(), axis=1)



## Test results

In [17]:
# (label, score) pairs comparing the predicted triad classes with the sampled labels
test_metrics = (
    ("Test Micro F1 Score: ", f1_score(t_test, pred, average='micro')),
    ("Test Weighted F1 Score: ", f1_score(t_test, pred, average='weighted')),
    ("Test Accuracy Score: ", accuracy_score(t_test, pred)),
)
for metric_label, metric_value in test_metrics:
    print(metric_label, metric_value)

Test Micro F1 Score:  0.40425531914893614
Test Weighted F1 Score:  0.34858056737411497
Test Accuracy Score:  0.40425531914893614
