In [12]:
pip install dgl

Note: you may need to restart the kernel to use updated packages.


In [13]:
# model
import torch.nn as nn
import dgl.nn
from dgl.nn import GATConv, SGConv, SAGEConv


class GAT(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes, num_layers, dropout_rate, activation):
        super(GAT, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(GATConv(in_feats, hidden_size, num_heads=1, activation=activation, allow_zero_in_degree=True))
        for _ in range(num_layers - 2):
            self.layers.append(GATConv(hidden_size, hidden_size, num_heads=1, activation=activation, allow_zero_in_degree=True))
        self.layers.append(GATConv(hidden_size, num_classes, num_heads=1, activation=None, allow_zero_in_degree=True))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, g, features):
        h = features
        for layer in self.layers[:-1]:
            h = layer(g, h).flatten(1)
            h = self.dropout(h)
        h = self.layers[-1](g, h).mean(1)  # No activation on the final layer
        return h


class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes, activation, num_layers, dropout_rate):
        super(GCN, self).__init__()
        layers = [
            dgl.nn.GraphConv(in_feats, hidden_size, activation=activation, allow_zero_in_degree=True)
        ]
        for _ in range(num_layers - 2):
            layers.append(dgl.nn.GraphConv(hidden_size, hidden_size, activation=activation, allow_zero_in_degree=True))
        layers.append(dgl.nn.GraphConv(hidden_size, num_classes, allow_zero_in_degree=True))
        self.layers = nn.ModuleList(layers)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)
            h = self.dropout(h)
        return h

# Add Simplifying Graph Convolutional Networks (SGC)
class SGC(nn.Module):
    def __init__(self, in_feats, num_classes, num_layers, activation, out_feats=256):  # Set an appropriate value for out_feats
        super(SGC, self).__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(SGConv(in_feats, out_feats, allow_zero_in_degree=True))  # Use out_feats instead of num_classes
            in_feats = out_feats  # Update in_feats for the next layer
        self.activation = activation  # Store the activation function

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)
            h = self.activation(h)  # Apply the activation function
        return h

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, num_classes, num_layers, activation, out_feats=128):
        super(GraphSAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGEConv(in_feats, out_feats, 'mean'))  # Use 'mean' aggregation for the first layer
        for _ in range(num_layers - 2):
            self.layers.append(SAGEConv(out_feats, out_feats, 'mean'))
        self.layers.append(SAGEConv(out_feats, num_classes, 'mean'))
        self.activation = activation

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)
            h = self.activation(h)
        return h

In [14]:
import dgl
import torch
import dgl.data
import os
import numpy as np

from tensorflow import keras

# Define functions to calculate precision, recall, and F-measure
def calculate_metrics(pred, labels):
    tp = (pred * labels).sum().float()
    fp = ((pred == 1) * (labels == 0)).sum().float()
    fn = ((pred == 0) * (labels == 1)).sum().float()

    micro_precision = tp / (tp + fp + 1e-10)
    micro_recall = tp / (tp + fn + 1e-10)

    unique_labels = labels.unique()
    macro_precision = 0.0
    macro_recall = 0.0

    for label in unique_labels:
        tp_i = ((pred == label) * (labels == label)).sum().float()
        fp_i = ((pred == label) * (labels != label)).sum().float()
        fn_i = ((pred != label) * (labels == label)).sum().float()

        precision_i = tp_i / (tp_i + fp_i + 1e-10)
        recall_i = tp_i / (tp_i + fn_i + 1e-10)

        macro_precision += precision_i
        macro_recall += recall_i

    macro_precision /= len(unique_labels)
    macro_recall /= len(unique_labels)

    f_measure = 2 * (macro_precision * macro_recall) / (macro_precision + macro_recall + 1e-10)

    return macro_precision.item(), micro_precision.item(), macro_recall.item(), micro_recall.item(), f_measure.item()

# Function to load the MNIST dataset
def load_mnist_dataset():
    (train_data, train_labels), (test_data, test_labels) = keras.datasets.mnist.load_data()
    num_train = len(train_data)
    num_test = len(test_data)
    num_nodes = num_train + num_test

    g = dgl.DGLGraph()
    g.add_nodes(num_nodes)
    mnist_data = torch.cat([torch.Tensor(train_data).view(num_train, -1), torch.Tensor(test_data).view(num_test, -1)], dim=0)
    g.ndata['feat'] = mnist_data
    g.add_edges(range(num_nodes), range(num_nodes))
    
    labels = torch.cat([torch.Tensor(train_labels), torch.Tensor(test_labels)], dim=0)
    labels = labels.long() 
    g.ndata['label'] = labels

    # Create mask features
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[:num_train] = True
    val_mask[num_train:num_train+1000] = True  # Assuming 1000 validation samples
    test_mask[num_train+1000:] = True  # Assuming the rest are test samples

    g.ndata['train_mask'] = train_mask
    g.ndata['val_mask'] = val_mask
    g.ndata['test_mask'] = test_mask
    g.num_classes = 10
    return g

def load_nell():
    data = torch.load('data/nell/processed/data.pt')
    data = data[0]
    num_nodes = data.num_nodes
    edge_index = data.edge_index
    # Convert 'x' to a dense tensor
    x = data.x
    if x is not None:
        x = x.to_dense()

    y = data.y  # Node labels
    train_mask = data.train_mask
    val_mask = data.val_mask
    test_mask = data.test_mask

    # Create an empty DGL graph
    g = dgl.DGLGraph()
    
    # Add nodes to the graph and explicitly set the 'num_nodes' attribute
    g.add_nodes(num_nodes)

    # Add edges to the graph
    g.add_edges(edge_index[0], edge_index[1])

    # If you have node features, you can set them for the DGL graph
    if x is not None:
        g.ndata['feat'] = x
    # Set node labels
    if y is not None:
        g.ndata['label'] = y

    # Set masks for training, validation, and testing
    if train_mask is not None:
        g.ndata['train_mask'] = train_mask
    if val_mask is not None:
        g.ndata['val_mask'] = val_mask
    if test_mask is not None:
        g.ndata['test_mask'] = test_mask
    g.num_classes = 186
    return g

# Function to load datasets
def load_dataset(dataset_name):
    if dataset_name in ['cora', 'citeseer', 'pubmed']:
        dataset = dgl.data.__dict__[f'{dataset_name.capitalize()}GraphDataset']()
    elif dataset_name == 'mnist':
        dataset = load_mnist_dataset()
    elif dataset_name == 'reddit':
        dataset = dgl.data.RedditDataset()
    elif dataset_name == 'nell':
        dataset = load_nell()
    elif dataset_name == 'blogcatalog':
        dataset = load_blogcatalog()
    return dataset

def load_blogcatalog():
    # Load the dataset
    dataset = dgl.data.CSVDataset('data/blogcatalog_dataset')

    # Extract the graph (assuming there's only one graph in the dataset)
    g = dataset[0]

    # Generate random features for the nodes
    num_nodes = g.number_of_nodes()
    feat_dim = 64
    feat = torch.randn(num_nodes, feat_dim)
    g.ndata['feat'] = feat

    # Define the number of classes (assuming a specific number, modify as needed)
    num_classes = 39

    # Generate random labels for the nodes
    labels = torch.randint(0, num_classes, (num_nodes,))
    g.ndata['label'] = labels

    # Create masks for train, validation, and test sets
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    # Assuming 60% train, 20% validation, and 20% test split
    train_mask[:int(0.6 * num_nodes)] = 1
    val_mask[int(0.6 * num_nodes):int(0.8 * num_nodes)] = 1
    test_mask[int(0.8 * num_nodes):] = 1

    # Assign masks to the graph
    g.ndata['train_mask'] = train_mask
    g.ndata['val_mask'] = val_mask
    g.ndata['test_mask'] = test_mask

    # Retrieve the number of classes
    num_classes = len(torch.unique(g.ndata['label']))
    g.num_classes = num_classes
    # print("Number of classes:", num_classes)
    return g

In [15]:
# activation function
import torch
import torch.nn.functional as F

# ReLU
def relu(x):
    return F.relu(x)

# Leaky ReLU
def leaky_relu(x, alpha=0.01):
    return F.leaky_relu(x, negative_slope=alpha)

# Tanh
def tanh(x):
    return torch.tanh(x)

# ELU
def elu(x, alpha=1.0):
    return F.elu(x, alpha=alpha)

# Swish
def swish(x):
    return x * torch.sigmoid(x)

# Mish
def mish(x):
    return x * torch.tanh(F.softplus(x))

# Smish
def smish(x):
    return x * torch.sigmoid(1.702 * x)

# LiSHT
def lisht(x):
    return x * torch.tanh(x)

# HardSeReLU
def hard_selerelu(x):
    return torch.where(x < 0, torch.mul(0.2, torch.expm1(x)), torch.mul(0.2 * F.relu(x), torch.clamp((x + 1) / 2, min=0, max=1)))


In [None]:
torch.cuda.is_available()

In [16]:
import dgl
import torch
import torch.nn.functional as F
import os
import pandas as pd
import time

# from gnn.model import GCN, GAT, GraphSAGE, SGC
# from utils.tools import calculate_metrics, load_dataset
# import gnn.activation_function as af

import warnings
warnings.filterwarnings("ignore")

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dataset_names = ['pubmed', 'cora', 'citeseer', 'blogcatalog', 'reddit', 'mnist', 'nell']
dataset_names = ['reddit']

# gnn_models = [GCN, GAT, GraphSAGE, SGC]
gnn_models = [GAT]

# activation_functions = [af.relu, af.tanh, af.leaky_relu, af.elu, af.swish, af.mish, af.lisht, af.smish, af.hard_selerelu]
activation_functions = [tanh]

# num_epochs_list = [10, 50, 100, 250, 500]
num_epochs_list = [10]

# learning_rates = [0.01, 0.00001]
learning_rates = [0.01]

num_layers_list = [2, 4]
# num_layers_list = [2]

# dropout_rates = [0.05]
dropout_rates = [0.1, 0.05]

hidden_sizes = [16, 32]
# hidden_sizes = [16]

results_df = pd.DataFrame(columns=['Dataset', 'Model', 'Activation', 'Epoch', 'Loss', 'Val Acc', 'Best Val Acc', 'Test Acc', 'Best Test Acc', 'Macro Precision', 'Micro Precision', 'Macro Recall', 'Micro Recall', 'F-measure', 'Computation Time (seconds)', 'Num Epochs', 'Learning Rate', 'Num Layers', 'Dropout Rate', 'Hidden Size'])

# Define a function to train the model and save train accuracy
def train(g, model, dataset_name, gnn_name, activation_name, num_epochs, learning_rate, num_layers, dropout_rate, hidden_size, run_number):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_val_acc = 0
    best_test_acc = 0
    best_train_acc = 0

    features = g.ndata['feat'].to(device)
    labels = g.ndata['label'].to(device)
    train_mask = g.ndata['train_mask'].to(device)
    val_mask = g.ndata['val_mask'].to(device)
    test_mask = g.ndata['test_mask'].to(device)

    g = dgl.add_self_loop(g)
    results_dfs = []

    for epoch in range(num_epochs):
        start_time = time.time()

        logits = model(g.to(device), features)
        pred = logits.argmax(1)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        macro_precision, micro_precision, macro_recall, micro_recall, f_measure = calculate_metrics(pred[test_mask], labels[test_mask])

        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()

        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        if best_val_acc < val_acc:
            best_val_acc = val_acc

        if best_test_acc < test_acc:
            best_test_acc = test_acc

        if best_train_acc < train_acc:
            best_train_acc = train_acc    

        if isinstance(best_test_acc, int):
            best_test_acc_value = best_test_acc
        else:
            best_test_acc_value = best_test_acc.item()

        if isinstance(best_val_acc, int):
            best_val_acc_value = best_val_acc
        else:
            best_val_acc_value = best_val_acc.item()

        if isinstance(best_train_acc, int):
            best_train_acc_value = best_train_acc
        else:
            best_train_acc_value = best_train_acc.item()    

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        end_time = time.time()
        computation_time = end_time - start_time

        epoch_result = pd.DataFrame({
            'Run Number': [run_number],
            'Dataset': [dataset_name],
            'Activation Function': [activation_name],
            'GNN Model': [gnn_name],
            'Learning Rate': [learning_rate],
            'Num Layers': [num_layers],
            'Dropout Rate': [dropout_rate],
            'Hidden Size': [hidden_size],
            'Num Epochs': [num_epochs],
            'Epochs': [epoch + 1],
            'Loss': [loss.item()],
            'Train Acc': [train_acc.item()],
            'Best Train Acc': [best_train_acc_value],
            'Valid Acc': [val_acc.item()],
            'Best Valid Acc': [best_val_acc_value],
            'Test Acc': [test_acc.item()],
            'Best Test Acc': [best_test_acc_value],
            'Macro Precision': [macro_precision],
            'Micro Precision': [micro_precision],
            'Macro Recall': [macro_recall],
            'Micro Recall': [micro_recall],
            'F-measure': [f_measure],
            'Computation Time (seconds)': [computation_time]
        })
        results_dfs.append(epoch_result)

    print("Train Done")
    results_df = pd.concat(results_dfs, ignore_index=True)
    return results_df

# Function to create folders if they don't exist
def create_folders(*folders):
    for folder in folders:
        if not os.path.exists(folder):
            os.makedirs(folder)

# Function to save basic results to an Excel file with a formatted filename
def save_results_to_excel(data_name, gnn_name, activation_name, results_df):
    # Create folder structure
    result_folder = 'result_node_classification50'
    dataset_folder = os.path.join(result_folder, data_name)
    gnn_folder = os.path.join(dataset_folder, gnn_name)
    activation_folder = os.path.join(gnn_folder, activation_name)
    
    create_folders(result_folder, dataset_folder, gnn_folder, activation_folder)
    
    # Construct the file name
    file_name = f'{data_name}_{gnn_name}_{activation_name}.xlsx'

    # Save the DataFrame to the Excel file with the constructed file path
    file_path = os.path.join(activation_folder, file_name)
    results_df.to_excel(file_path, index=False)

# Function to save detailed results to an Excel file with a formatted filename
def save_detailed_results_to_excel(data_name, gnn_name, activation_name, num_epochs, learning_rate, num_layers, dropout_rate, hidden_size, results_df):
    # Create folder structure
    result_folder = 'result_node_classification50'
    dataset_folder = os.path.join(result_folder, data_name)
    gnn_folder = os.path.join(dataset_folder, gnn_name)
    activation_folder = os.path.join(gnn_folder, activation_name)
    
    create_folders(result_folder, dataset_folder, gnn_folder, activation_folder)
    
    # Construct the file name for detailed results
    file_name = f'{data_name}_{gnn_name}_{activation_name}_epochs{num_epochs}_lr{learning_rate}_layers{num_layers}_dropout{dropout_rate}_hidden{hidden_size}_detailed_results.xlsx'

    # Save the DataFrame to the Excel file with the constructed file path
    file_path = os.path.join(activation_folder, file_name)
    results_df.to_excel(file_path, index=False)

results = []
run_number = 1
for data_name in dataset_names:
    dataset = load_dataset(data_name)
    for gnn in gnn_models:
        for activation in activation_functions:
            results_data_gnn_activation = []
            for num_epochs in num_epochs_list:
                for learning_rate in learning_rates:
                    for num_layers in num_layers_list:
                        for dropout_rate in dropout_rates:
                            for hidden_size in hidden_sizes:
                                print(f'Run number: {run_number}\tDataset Name: {data_name}\tModel GNN: {gnn.__name__}\tActivation function: {activation.__name__}\tNum Epochs: {num_epochs}\tLearning Rate: {learning_rate}\tNum Layers: {num_layers}\tDropout Rate: {dropout_rate}\tHidden Size: {hidden_size}')
                                if data_name == 'ppi':
                                    dataset.num_classes = dataset.num_labels
                                elif data_name == 'mnist':
                                    g = dataset
                                    num_classes = dataset.num_classes
                                    out_feats = 128
                                elif data_name == 'nell':
                                    g = dataset
                                    num_classes = dataset.num_classes
                                    out_feats = num_classes
                                elif data_name == 'blogcatalog':
                                    g = dataset
                                    num_classes = g.num_classes
                                else:
                                    num_classes = dataset.num_classes
                                    out_feats = 128
                                    g = dataset[0]

#                                 print('Number of categories:', num_classes)

                                if (gnn == GraphSAGE) or (gnn == SGC):
                                    model = gnn(g.ndata['feat'].shape[1], num_classes, num_layers, activation, out_feats).to(device)
                                else:
                                    model = gnn(g.ndata['feat'].shape[1], hidden_size=hidden_size, num_classes=num_classes, num_layers=num_layers, dropout_rate=dropout_rate, activation=activation).to(device)
                                start_time1 = time.time()
                                print(start_time1)

                                result = train(g, model, data_name, gnn.__name__, activation.__name__, num_epochs, learning_rate, num_layers, dropout_rate, hidden_size, run_number)
                                end_time1 = time.time()
                                print(end_time1)
                                computation_time1 = end_time1 - start_time1
                                print(computation_time1)
                                train_df = pd.concat([result], ignore_index=True)
                                save_detailed_results_to_excel(data_name, gnn.__name__, activation.__name__, num_epochs, learning_rate, num_layers, dropout_rate, hidden_size, train_df)
                                results_data_gnn_activation.append(result)
                                results.append(result)
                                run_number=run_number+1
            results_data_gnn_activation_df = pd.concat(results_data_gnn_activation, ignore_index=True)                    
            save_results_to_excel(data_name, gnn.__name__, activation.__name__, results_data_gnn_activation_df)
            
results_df = pd.concat(results, ignore_index=True)

results_df.to_excel(f'result_node_classification50/results_node_classification_{dataset_names}.xlsx', index=False)


Run number: 1	Dataset Name: reddit	Model GNN: GAT	Activation function: tanh	Num Epochs: 10	Learning Rate: 0.01	Num Layers: 2	Dropout Rate: 0.1	Hidden Size: 16
1701233892.4068606
Train Done
1701234698.6821005
806.275239944458
Run number: 2	Dataset Name: reddit	Model GNN: GAT	Activation function: tanh	Num Epochs: 10	Learning Rate: 0.01	Num Layers: 2	Dropout Rate: 0.1	Hidden Size: 32
1701234698.709915
Train Done
1701235597.2422466
898.5323317050934
Run number: 3	Dataset Name: reddit	Model GNN: GAT	Activation function: tanh	Num Epochs: 10	Learning Rate: 0.01	Num Layers: 2	Dropout Rate: 0.05	Hidden Size: 16
1701235597.265659
Train Done
1701236358.5539277
761.2882685661316
Run number: 4	Dataset Name: reddit	Model GNN: GAT	Activation function: tanh	Num Epochs: 10	Learning Rate: 0.01	Num Layers: 2	Dropout Rate: 0.05	Hidden Size: 32
1701236358.5795617
Train Done
1701237230.685063
872.1055011749268
Run number: 5	Dataset Name: reddit	Model GNN: GAT	Activation function: tanh	Num Epochs: 10	Learnin

In [17]:
results_df

Unnamed: 0,Run Number,Dataset,Activation Function,GNN Model,Learning Rate,Num Layers,Dropout Rate,Hidden Size,Num Epochs,Epochs,...,Valid Acc,Best Valid Acc,Test Acc,Best Test Acc,Macro Precision,Micro Precision,Macro Recall,Micro Recall,F-measure,Computation Time (seconds)
0,1,reddit,tanh,GAT,0.01,2,0.10,16,10,1,...,0.063657,0.063657,0.063049,0.063049,0.031877,0.999999,0.033581,1.0,0.032707,90.963950
1,1,reddit,tanh,GAT,0.01,2,0.10,16,10,2,...,0.186648,0.186648,0.185179,0.185179,0.054037,1.000000,0.061255,1.0,0.057420,80.575576
2,1,reddit,tanh,GAT,0.01,2,0.10,16,10,3,...,0.208384,0.208384,0.207493,0.207493,0.106770,1.000000,0.070873,1.0,0.085195,79.265591
3,1,reddit,tanh,GAT,0.01,2,0.10,16,10,4,...,0.281860,0.281860,0.278369,0.278369,0.127036,1.000000,0.108272,1.0,0.116906,76.350540
4,1,reddit,tanh,GAT,0.01,2,0.10,16,10,5,...,0.284294,0.284294,0.279410,0.279410,0.154734,1.000000,0.121825,1.0,0.136322,75.722263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,8,reddit,tanh,GAT,0.01,4,0.05,32,10,6,...,0.398263,0.398263,0.398776,0.398776,0.132766,1.000000,0.130837,1.0,0.131795,187.085221
76,8,reddit,tanh,GAT,0.01,4,0.05,32,10,7,...,0.401704,0.401704,0.403748,0.403748,0.139734,1.000000,0.132241,1.0,0.135884,198.752966
77,8,reddit,tanh,GAT,0.01,4,0.05,32,10,8,...,0.416558,0.416558,0.418092,0.418092,0.139485,1.000000,0.151134,1.0,0.145076,200.294433
78,8,reddit,tanh,GAT,0.01,4,0.05,32,10,9,...,0.419244,0.419244,0.421683,0.421683,0.159475,1.000000,0.157293,1.0,0.158376,205.763353


In [22]:
!ls

redit_gat_tanh.zip  redit_sgc_tanh.zip	result_node_classification50


In [None]:
!zip -r redit_gat_tanh.zip /kaggle/working

In [19]:
!ls

redit_gat_tanh.zip  redit_sgc_tanh.zip	result_node_classification50


In [23]:
from IPython.display import FileLink
FileLink(r'redit_gat_tanh.zip')

