In [None]:
%matplotlib inline

In [None]:
# !pip install dgl
!pip install dgl==1.0.1
!pip install torch==2.0.0
!pip install node2vec
!pip install 'networkx<2.7'
!pip install 'scipy>=1.8'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
import dgl.function as fn
import itertools
import numpy as np
import scipy.sparse as sp

In [None]:
# Load DGL's built-in Cora dataset and take its first (index 0) graph; the
# printed summary below reports its node/edge/feature/class counts.
dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [None]:
# Endpoint node IDs of every edge in g, as two parallel sequences
# (per DGL's `edges()`: sources first, destinations second). Position i in
# u/v is edge i, which is what the edge-ID indexing in later cells relies on.
u, v = g.edges()

In [None]:
# Shuffle all edge IDs and hold out the first 10% as positive test edges;
# the remaining 90% are the positive training edges.
edge_ids = np.random.permutation(np.arange(g.number_of_edges()))
test_size = int(len(edge_ids) * 0.1)
train_size = g.number_of_edges() - test_size

# Name the two ID slices once, then gather the endpoints for each split.
test_edge_ids, train_edge_ids = edge_ids[:test_size], edge_ids[test_size:]
test_pos_u, test_pos_v = u[test_edge_ids], v[test_edge_ids]
train_pos_u, train_pos_v = u[train_edge_ids], v[train_edge_ids]

In [None]:
# Build the pool of negative (non-existent) edges and sample from it.
#
# adj_neg[i, j] is non-zero exactly where there is no edge i -> j and i != j
# (1 - adjacency - identity), so np.where lists every candidate negative pair.
num_nodes = g.number_of_nodes()

# Pass the shape explicitly: without it coo_matrix infers the size from the
# largest node ID that appears in an edge, which would be smaller than
# np.eye(num_nodes) if the highest-numbered nodes were isolated.
adj = sp.coo_matrix(
    (np.ones(len(u)), (u.numpy(), v.numpy())),
    shape=(num_nodes, num_nodes),
)
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
neg_u, neg_v = np.where(adj_neg != 0)

# Sample half as many negatives as there are edges in g. replace=False keeps
# every sampled pair distinct, so the test and train slices below cannot
# share a negative edge (np.random.choice samples with replacement by default).
neg_edge_ids = np.random.choice(len(neg_u), g.number_of_edges() // 2, replace=False)
test_neg_u, test_neg_v = neg_u[neg_edge_ids[:test_size]], neg_v[neg_edge_ids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_edge_ids[test_size:]], neg_v[neg_edge_ids[test_size:]]

In [None]:
# Training graph: g with the held-out positive test edges (the first
# `test_size` shuffled edge IDs) removed.
# NOTE(review): in the cells visible here train_g is only used to read the
# feature dimension; the training loops run on the full graph g — confirm.
train_g = dgl.remove_edges(g, edge_ids[:test_size])

In [None]:
def get_optimal_threshold(tpr, fpr, thresholds):
    """Return the threshold whose ROC point maximises ``tpr * (1 - fpr)``.

    Args:
        tpr: array of true-positive rates, one per candidate threshold.
        fpr: array of false-positive rates, aligned with ``tpr``.
        thresholds: candidate thresholds, aligned with ``tpr`` / ``fpr``.

    Returns:
        The element of ``thresholds`` at the first position where
        ``tpr * (1 - fpr)`` is largest.
    """
    true_negative_rate = 1 - fpr
    best_index = np.argmax(tpr * true_negative_rate)
    return thresholds[best_index]

In [None]:
def binarize_scores(scores, labels, optimal_threshold):
    """Convert scores into hard 0/1 predictions.

    A score maps to 0 when it is less than or equal to ``optimal_threshold``
    and to 1 otherwise.

    Args:
        scores: indexable sequence of scores.
        labels: accepted for call compatibility; not used.
        optimal_threshold: decision threshold.

    Returns:
        NumPy array with one 0/1 entry per score.
    """
    return np.asarray(
        [0 if scores[idx] <= optimal_threshold else 1 for idx in range(len(scores))]
    )

# Logistic Regression

In [None]:
# Define logistic regression model
class LogisticRegression(nn.Module):
    """Feature-only classifier: Linear -> ReLU -> Linear.

    The graph is ignored; only the node features are used. Note that with the
    hidden layer this is a two-layer perceptron rather than a plain logistic
    regression.

    Args:
        in_feats: size of each input feature vector.
        hidden_size: width of the hidden layer.
        num_classes: number of output logits per node.
    """

    def __init__(self, in_feats, hidden_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(in_feats, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, g, features):
        """Return class logits for ``features``.

        Args:
            g: graph argument, unused; kept so the call signature matches the
                graph models in this notebook (``model(g, features)``).
            features: tensor whose last dimension is ``in_feats``.

        Returns:
            Tensor of logits whose last dimension is ``num_classes``.
        """
        # nn.Linear takes a single input tensor; passing the graph as well
        # (as the previous code did) raises a TypeError.
        x = F.relu(self.linear(features))
        x = self.linear2(x)
        return x


# GCN

In [None]:
class GCN(nn.Module):
    """Two stacked ``GraphConv`` layers with a ReLU in between.

    Both layers produce ``h_feats`` channels, so the output width equals the
    hidden width.

    Args:
        in_feats: size of each input node feature vector.
        h_feats: number of channels produced by each layer.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` and return the result."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [None]:
# Instantiate the GCN: input width = node-feature dimension, 16 channels per layer.
# NOTE(review): GCN uses h_feats for its output layer too, so this model emits
# 16 values per node, while the dataset summary reports 7 classes and the
# training cell passes the output straight to cross_entropy — confirm the
# extra output columns are intended.
model = GCN(train_g.ndata['feat'].shape[1], 16)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Full-graph node-classification training of `model`, followed by a final
# evaluation on the test mask.
#
# Inputs read from earlier cells: `model`, `g`.
# Results left for later cells: `test_acc`, `test_f1`, `test_precision`,
# `test_recall` (computed with the weights after the last epoch), plus
# `best_val_acc` / `best_test_acc` tracked during training.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
best_val_acc = 0
best_test_acc = 0

features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
val_mask = g.ndata["val_mask"]
test_mask = g.ndata["test_mask"]

for e in range(100):
    # Forward
    logits = model(g, features)

    # Compute prediction
    pred = logits.argmax(1)

    # Compute loss
    # Note that you should only compute the losses of the nodes in the training set.
    loss = F.cross_entropy(logits[train_mask], labels[train_mask])

    # Compute accuracy on training/validation/test
    # (measured with the weights *before* this epoch's optimizer step).
    train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
    val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
    test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

    # Save the best validation accuracy and the corresponding test accuracy.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc

    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print(
            "In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})".format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc
            )
        )

# Compute evaluation metrics on test data.
# Run the final forward pass in eval mode and without autograd bookkeeping:
# no gradients are needed here, and eval mode is the correct setting for
# inference. The previous train/eval mode is restored afterwards so later
# cells see the model unchanged.
was_training = model.training
model.eval()
with torch.no_grad():
    logits_test = model(g, features)
model.train(was_training)
pred_test = logits_test.argmax(1)

# Convert once and reuse for every metric.
test_labels_np = labels[test_mask].cpu().numpy()
test_pred_np = pred_test[test_mask].cpu().numpy()

# NOTE: `test_acc` is rebound here from the per-epoch tensor to the final
# scikit-learn accuracy (a Python float); the reporting cell prints this value.
test_acc = accuracy_score(test_labels_np, test_pred_np)
test_f1 = f1_score(test_labels_np, test_pred_np, average='weighted')
test_precision = precision_score(test_labels_np, test_pred_np, average='weighted')
test_recall = recall_score(test_labels_np, test_pred_np, average='weighted')



  assert input.numel() == input.storage().size(), (


In epoch 0, loss: 0.805, val acc: 0.760 (best 0.760), test acc: 0.741 (best 0.741)
In epoch 5, loss: 0.681, val acc: 0.774 (best 0.776), test acc: 0.756 (best 0.749)
In epoch 10, loss: 0.571, val acc: 0.774 (best 0.776), test acc: 0.766 (best 0.749)
In epoch 15, loss: 0.475, val acc: 0.776 (best 0.776), test acc: 0.770 (best 0.749)
In epoch 20, loss: 0.393, val acc: 0.780 (best 0.780), test acc: 0.777 (best 0.775)
In epoch 25, loss: 0.324, val acc: 0.778 (best 0.782), test acc: 0.777 (best 0.779)
In epoch 30, loss: 0.267, val acc: 0.776 (best 0.782), test acc: 0.778 (best 0.779)
In epoch 35, loss: 0.221, val acc: 0.772 (best 0.782), test acc: 0.782 (best 0.779)
In epoch 40, loss: 0.183, val acc: 0.776 (best 0.782), test acc: 0.779 (best 0.779)
In epoch 45, loss: 0.153, val acc: 0.782 (best 0.782), test acc: 0.777 (best 0.779)
In epoch 50, loss: 0.128, val acc: 0.776 (best 0.782), test acc: 0.778 (best 0.779)
In epoch 55, loss: 0.108, val acc: 0.772 (best 0.782), test acc: 0.779 (best 0

In [None]:
print("Test accuracy: {:.3f}".format(test_acc))
print("Test F-score: {:.3f}".format(test_f1))
print("Test Precision: {:.3f}".format(test_precision))
print("Test Recall: {:.3f}".format(test_recall))

Test accuracy: 0.770
Test F-score: 0.770
Test Precision: 0.783
Test Recall: 0.770


# GraphSAGE

In [None]:
import torch.nn.functional as F
from dgl.nn import SAGEConv, GATConv

class GraphSAGE(nn.Module):
    """Two stacked ``SAGEConv`` layers ('mean' aggregator) with a ReLU in between.

    Both layers produce ``h_feats`` channels, so the output width equals the
    hidden width.

    Args:
        in_feats: size of each input node feature vector.
        h_feats: number of channels produced by each layer.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` and return the result."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)


In [None]:

# Rebind `model` to a GraphSAGE network: input width = node-feature dimension,
# 32 channels per layer.
# NOTE(review): GraphSAGE uses h_feats for its output layer too, so this model
# emits 32 values per node, while the dataset summary reports 7 classes and
# the training cell passes the output straight to cross_entropy — confirm the
# extra output columns are intended.
model = GraphSAGE(train_g.ndata['feat'].shape[1], 32)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Full-graph node-classification training of `model`, followed by a final
# evaluation on the test mask.
#
# Inputs read from earlier cells: `model`, `g`.
# Results left for later cells: `test_acc`, `test_f1`, `test_precision`,
# `test_recall` (computed with the weights after the last epoch), plus
# `best_val_acc` / `best_test_acc` tracked during training.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

best_val_acc = 0
best_test_acc = 0

features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
val_mask = g.ndata["val_mask"]
test_mask = g.ndata["test_mask"]

for e in range(100):
    # Forward
    logits = model(g, features)

    # Compute prediction
    pred = logits.argmax(1)

    # Compute loss
    # Note that you should only compute the losses of the nodes in the training set.
    loss = F.cross_entropy(logits[train_mask], labels[train_mask])

    # Compute accuracy on training/validation/test
    # (measured with the weights *before* this epoch's optimizer step).
    train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
    val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
    test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

    # Save the best validation accuracy and the corresponding test accuracy.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc

    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print(
            "In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})".format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc
            )
        )

# Compute evaluation metrics on test data.
# Run the final forward pass in eval mode and without autograd bookkeeping:
# no gradients are needed here, and eval mode is the correct setting for
# inference. The previous train/eval mode is restored afterwards so later
# cells see the model unchanged.
was_training = model.training
model.eval()
with torch.no_grad():
    logits_test = model(g, features)
model.train(was_training)
pred_test = logits_test.argmax(1)

# Convert once and reuse for every metric.
test_labels_np = labels[test_mask].cpu().numpy()
test_pred_np = pred_test[test_mask].cpu().numpy()

# NOTE: `test_acc` is rebound here from the per-epoch tensor to the final
# scikit-learn accuracy (a Python float); the reporting cell prints this value.
test_acc = accuracy_score(test_labels_np, test_pred_np)
test_f1 = f1_score(test_labels_np, test_pred_np, average='weighted')
test_precision = precision_score(test_labels_np, test_pred_np, average='weighted')
test_recall = recall_score(test_labels_np, test_pred_np, average='weighted')



  assert input.numel() == input.storage().size(), (


In epoch 0, loss: 3.437, val acc: 0.000 (best 0.000), test acc: 0.003 (best 0.000)
In epoch 5, loss: 2.656, val acc: 0.170 (best 0.170), test acc: 0.194 (best 0.194)
In epoch 10, loss: 1.687, val acc: 0.226 (best 0.226), test acc: 0.247 (best 0.247)
In epoch 15, loss: 1.137, val acc: 0.592 (best 0.592), test acc: 0.565 (best 0.565)
In epoch 20, loss: 0.757, val acc: 0.744 (best 0.744), test acc: 0.700 (best 0.700)
In epoch 25, loss: 0.449, val acc: 0.750 (best 0.754), test acc: 0.716 (best 0.701)
In epoch 30, loss: 0.246, val acc: 0.736 (best 0.754), test acc: 0.709 (best 0.701)
In epoch 35, loss: 0.130, val acc: 0.752 (best 0.756), test acc: 0.744 (best 0.738)
In epoch 40, loss: 0.071, val acc: 0.756 (best 0.756), test acc: 0.747 (best 0.738)
In epoch 45, loss: 0.041, val acc: 0.748 (best 0.756), test acc: 0.741 (best 0.738)
In epoch 50, loss: 0.026, val acc: 0.748 (best 0.756), test acc: 0.746 (best 0.738)
In epoch 55, loss: 0.018, val acc: 0.748 (best 0.756), test acc: 0.754 (best 0

In [None]:
print("Test accuracy: {:.3f}".format(test_acc))
print("Test F-score: {:.3f}".format(test_f1))
print("Test Precision: {:.3f}".format(test_precision))
print("Test Recall: {:.3f}".format(test_recall))

Test accuracy: 0.761
Test F-score: 0.761
Test Precision: 0.784
Test Recall: 0.761
