# CS 249 Project GMAERF

## This portion runs the Graph Autoencoder on Cora dataset

In [1]:
## import and setups

import time
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

import torch
from torch import optim
import torch.nn.functional as F

from gae.model import GVAE
from gae.optimizer import loss_function
import gae.utils

%matplotlib inline

%load_ext autoreload
%autoreload 2

args = {
  'dataset': 'cora',
  'epochs': 200,
  'h1_dim': 32,
  'h2_dim': 16,
  'lr': 1e-2,
  'dropout': 0
}

In [2]:
print(f"using {args['dataset']} dataset")

## preprocessing
adj, features = gae.utils.load_data(args['dataset'])
n_nodes, feat_dim = features.shape
print(f"adj dim: {adj.shape}")
print(f"fea dim: {features.shape}")

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = gae.utils.mask_test_edges(adj)
adj = adj_train

adj_norm = gae.utils.preprocess_graph(adj)
adj_label = adj_train + sp.eye(adj_train.shape[0])
# adj_label = sparse_to_tuple(adj_label)
adj_label = torch.FloatTensor(adj_label.toarray())

pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

using cora dataset
adj dim: (2708, 2708)
fea dim: torch.Size([2708, 1433])


In [3]:
## training

model = GVAE(feat_dim, args['h1_dim'], args['h2_dim'], args['dropout'])
optimizer = optim.Adam(model.parameters(), lr=args['lr'])

hidden_emb = None
for epoch in range(args['epochs']):
  t = time.time()
  model.train()
  optimizer.zero_grad()
  recovered, mu, logvar = model(features, adj_norm)
  loss = loss_function(preds=recovered, labels=adj_label,
                       mu=mu, logvar=logvar, n_nodes=n_nodes,
                       norm=norm, pos_weight=pos_weight)
  loss.backward()
  cur_loss = loss.item()
  optimizer.step()

  hidden_emb = mu.data.numpy()
  roc_curr, ap_curr = gae.utils.get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

  print(f"Epoch{(epoch+1):4}:", f"train_loss={cur_loss:.5f}",
        f"val_ap={ap_curr:.5f}", f"time={(time.time()-t):.5f}")

Epoch   1: train_loss=1.80811 val_ap=0.70124 time=0.33693
Epoch   2: train_loss=1.62555 val_ap=0.69415 time=0.33081
Epoch   3: train_loss=1.45576 val_ap=0.68396 time=0.32325
Epoch   4: train_loss=1.32817 val_ap=0.68144 time=0.32389
Epoch   5: train_loss=1.24437 val_ap=0.68461 time=0.33732
Epoch   6: train_loss=1.15958 val_ap=0.68793 time=0.32505
Epoch   7: train_loss=1.09856 val_ap=0.69191 time=0.31539
Epoch   8: train_loss=1.00455 val_ap=0.69608 time=0.31305
Epoch   9: train_loss=0.92773 val_ap=0.70355 time=0.32709
Epoch  10: train_loss=0.87715 val_ap=0.71287 time=0.35105
Epoch  11: train_loss=0.81845 val_ap=0.72329 time=0.32789
Epoch  12: train_loss=0.76881 val_ap=0.73441 time=0.31437
Epoch  13: train_loss=0.74456 val_ap=0.74372 time=0.31256
Epoch  14: train_loss=0.71742 val_ap=0.75469 time=0.31542
Epoch  15: train_loss=0.70370 val_ap=0.76707 time=0.31032
Epoch  16: train_loss=0.68860 val_ap=0.77752 time=0.31750
Epoch  17: train_loss=0.67489 val_ap=0.78292 time=0.33153
Epoch  18: tra

In [4]:
## validate

roc_score, ap_score = gae.utils.get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
print('Test ROC score: ' + str(roc_score))
print('Test AP score: ' + str(ap_score))

Test ROC score: 0.9089544123948166
Test AP score: 0.919008859720803


This portion runs the GCN on cora dataset

In [5]:
# import and setups

from gcn.models import GCN
import gcn.utils

args = {
  'dataset': 'cora',
  'epochs': 200,
  'hidden_dim': 16,
  'lr': 1e-2,
  'weight_decay': 5e-4,
  'dropout': 0.5
}

In [10]:
# Load data
adj, features, labels, idx_train, idx_val, idx_test = gcn.utils.load_data()
n_nodes, feat_dim = features.shape

# Model and optimizer
model = GCN(nfeat=feat_dim,
            nhid=args['hidden_dim'],
            nclass=labels.max().item() + 1,
            dropout=args['dropout'])
optimizer = optim.Adam(model.parameters(),
                       lr=args['lr'],
                       weight_decay=args['weight_decay'])

Loading cora dataset...


In [25]:
# training

t_total = time.time()

for epoch in range(args['epochs']):
  t = time.time()
  model.train()
  optimizer.zero_grad()
  output = model(features, adj)
  loss_train = F.nll_loss(output[idx_train], labels[idx_train])
  acc_train = gcn.utils.accuracy(output[idx_train], labels[idx_train])
  loss_train.backward()
  optimizer.step()

  loss_val = F.nll_loss(output[idx_val], labels[idx_val])
  acc_val = gcn.utils.accuracy(output[idx_val], labels[idx_val])
  print(f'Epoch: {(epoch+1):04d}',
        f'loss_train: {loss_train.item():.4f}',
        f'acc_train: {acc_train.item():.4f}',
        f'loss_val: {loss_val.item():.4f}',
        f'acc_val: {acc_val.item():.4f}',
        f'time: {(time.time() - t):.4f}s')

npemb = model.hidden_emb.detach().numpy()
print(npemb.shape)
np.savetxt('hidden_emb.content', npemb)

print("Optimization Finished!")
print(f"Total time elapsed: {time.time() - t_total:.4f}s")

Epoch: 0001 loss_train: 0.1906 acc_train: 0.9786 loss_val: 0.6675 acc_val: 0.7833 time: 0.0087s
Epoch: 0002 loss_train: 0.1842 acc_train: 0.9857 loss_val: 0.7401 acc_val: 0.7667 time: 0.0086s
Epoch: 0003 loss_train: 0.2086 acc_train: 0.9786 loss_val: 0.7329 acc_val: 0.7900 time: 0.0078s
Epoch: 0004 loss_train: 0.1975 acc_train: 0.9786 loss_val: 0.7111 acc_val: 0.7800 time: 0.0085s
Epoch: 0005 loss_train: 0.1859 acc_train: 0.9714 loss_val: 0.6628 acc_val: 0.7967 time: 0.0079s
Epoch: 0006 loss_train: 0.1984 acc_train: 0.9929 loss_val: 0.6811 acc_val: 0.8000 time: 0.0081s
Epoch: 0007 loss_train: 0.1917 acc_train: 0.9857 loss_val: 0.6990 acc_val: 0.7600 time: 0.0076s
Epoch: 0008 loss_train: 0.2093 acc_train: 0.9786 loss_val: 0.6491 acc_val: 0.7900 time: 0.0083s
Epoch: 0009 loss_train: 0.1801 acc_train: 0.9857 loss_val: 0.6979 acc_val: 0.7767 time: 0.0075s
Epoch: 0010 loss_train: 0.1952 acc_train: 0.9857 loss_val: 0.7017 acc_val: 0.7767 time: 0.0082s
Epoch: 0011 loss_train: 0.1998 acc_train

In [17]:
# testing

model.eval()
output = model(features, adj)
loss_test = F.nll_loss(output[idx_test], labels[idx_test])
acc_test = gcn.utils.accuracy(output[idx_test], labels[idx_test])
print(f"Test set results:",
      f"loss= {loss_test.item():.4f}",
      f"accuracy= {acc_test.item():.4f}")

Test set results: loss= 0.6041 accuracy= 0.8320
