# CS 249 Project GMAERF

## This portion runs the Graph Autoencoder on Cora dataset

In [1]:
## import and setups

import time
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

import torch
from torch import optim
import torch.nn.functional as F

from gae.model import GVAE
from gae.optimizer import loss_function
import gae.utils

from sklearn.metrics.pairwise import cosine_similarity, paired_distances
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import cosine

%matplotlib inline

%load_ext autoreload
%autoreload 2

args = {
  'dataset': 'cora',
  'epochs': 200,
  'h1_dim': 32,
  'h2_dim': 16,
  'lr': 1e-2,
  'weight_decay': 5e-4,
  # 'weight_decay': 0,
  'dropout': 0,
  'target': 'feat'
}

In [2]:
# print(f"using {args['dataset']} dataset")

# preprocessing
adj, features = gae.utils.load_data(args['dataset'])
n_nodes, feat_dim = features.shape
# print(f"adj dim: {adj.shape}")
# print(adj)
# print(f"fea dim: {features.shape}")
# print(features)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = gae.utils.mask_test_edges(adj)
adj = adj_train

adj_norm = gae.utils.preprocess_graph(adj)
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = torch.FloatTensor(adj_label.toarray())

if args['target'] == 'adj':
    pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
elif args['target'] == 'feat':
    pos_weight = torch.Tensor([float(features.shape[0] * features.shape[0] - features.sum()) / features.sum()])
    norm = features.shape[0] * features.shape[0] / float((features.shape[0] * features.shape[0] - features.sum()) * 2)


In [3]:
## training

model = GVAE(feat_dim, args['h1_dim'], args['h2_dim'], args['dropout'], target=args['target'])
optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

hidden_emb = None
for epoch in range(args['epochs']):
  t = time.time()
  model.train()
  optimizer.zero_grad()
  recovered, mu, logvar = model(features, adj_norm)
  if args['target'] == 'adj':
    labels = adj_label
  elif args['target'] == 'feat':
    labels = features
  loss = loss_function(preds=recovered, labels=labels,
                       mu=mu, logvar=logvar, n_nodes=n_nodes,
                       norm=norm, pos_weight=pos_weight,
                       target=args['target'])
  loss.backward()
  cur_loss = loss.item()
  optimizer.step()

  hidden_emb = mu.data.numpy()

  metric = 'cosine'

  if args['target'] == 'adj':
    roc_curr, ap_curr = gae.utils.get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
    sim_score = (paired_distances(recovered.detach().numpy(), labels.numpy(), metric=metric)).mean()
    print(f"Epoch{(epoch+1):4}:", f"train_loss={cur_loss:.5f}",
          f"val_ap={ap_curr:.5f}", f"sim_score={sim_score:.5f}",
          f"time={(time.time()-t):.5f}")
  elif args['target'] == 'feat':
    sim_score = (paired_distances(recovered.detach().numpy(), labels.numpy(), metric=metric)).mean()
    print(f"Epoch{(epoch+1):4}:", f"train_loss={cur_loss:.5f}",
          f"sim_score={sim_score:.5f}", f"time={(time.time()-t):.5f}")

Epoch   1: train_loss=1.11818 sim_score=0.93681 time=0.19903
Epoch   2: train_loss=1.07703 sim_score=0.93448 time=0.19431
Epoch   3: train_loss=1.00149 sim_score=0.93151 time=0.20092
Epoch   4: train_loss=0.97221 sim_score=0.92713 time=0.19205
Epoch   5: train_loss=0.96356 sim_score=0.92440 time=0.19379
Epoch   6: train_loss=0.95985 sim_score=0.92036 time=0.19314
Epoch   7: train_loss=0.95784 sim_score=0.91679 time=0.19209
Epoch   8: train_loss=0.95467 sim_score=0.91244 time=0.19103
Epoch   9: train_loss=0.95131 sim_score=0.90713 time=0.19700
Epoch  10: train_loss=0.95005 sim_score=0.90399 time=0.19299
Epoch  11: train_loss=0.94674 sim_score=0.89783 time=0.19453
Epoch  12: train_loss=0.94418 sim_score=0.89330 time=0.19314
Epoch  13: train_loss=0.94281 sim_score=0.88918 time=0.19750
Epoch  14: train_loss=0.94116 sim_score=0.88567 time=0.19113
Epoch  15: train_loss=0.93808 sim_score=0.87975 time=0.19338
Epoch  16: train_loss=0.93618 sim_score=0.87658 time=0.19210
Epoch  17: train_loss=0.

In [4]:
## validate

roc_score, ap_score = gae.utils.get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
print('Test ROC score: ' + str(roc_score))
print('Test AP score: ' + str(ap_score))

npemb = hidden_emb.detach().numpy()
print(npemb.shape)
np.savetxt('hidden_emb_gvae.content', npemb)

Test ROC score: 0.5
Test AP score: 0.5


AttributeError: 'numpy.ndarray' object has no attribute 'detach'

## This portion runs the GCN on cora dataset

In [None]:
# import and setups

# from gcn.models import GCN
# import gcn.utils

# args = {
#   'dataset': 'cora',
#   'epochs': 200,
#   'hidden_dim': 16,
#   'lr': 1e-2,
#   'weight_decay': 5e-4,
#   'dropout': 0.5
# }

In [None]:
# Load data
# adj, features, labels, idx_train, idx_val, idx_test = gcn.utils.load_data()
# n_nodes, feat_dim = features.shape

# # Model and optimizer
# model = GCN(nfeat=feat_dim,
#             nhid=args['hidden_dim'],
#             nclass=labels.max().item() + 1,
#             dropout=args['dropout'])
# optimizer = optim.Adam(model.parameters(),
#                        lr=args['lr'],
#                        weight_decay=args['weight_decay'])

In [None]:
# training

# t_total = time.time()

# for epoch in range(args['epochs']):
#   t = time.time()
#   model.train()
#   optimizer.zero_grad()
#   output = model(features, adj)
#   loss_train = F.nll_loss(output[idx_train], labels[idx_train])
#   acc_train = gcn.utils.accuracy(output[idx_train], labels[idx_train])
#   loss_train.backward()
#   optimizer.step()

#   loss_val = F.nll_loss(output[idx_val], labels[idx_val])
#   acc_val = gcn.utils.accuracy(output[idx_val], labels[idx_val])
#   print(f'Epoch: {(epoch+1):04d}',
#         f'loss_train: {loss_train.item():.4f}',
#         f'acc_train: {acc_train.item():.4f}',
#         f'loss_val: {loss_val.item():.4f}',
#         f'acc_val: {acc_val.item():.4f}',
#         f'time: {(time.time() - t):.4f}s')

# npemb = model.hidden_emb.detach().numpy()
# print(npemb.shape)
# np.savetxt('hidden_emb.content', npemb)

# print("Optimization Finished!")
# print(f"Total time elapsed: {time.time() - t_total:.4f}s")

In [None]:
# testing

# model.eval()
# output = model(features, adj)
# loss_test = F.nll_loss(output[idx_test], labels[idx_test])
# acc_test = gcn.utils.accuracy(output[idx_test], labels[idx_test])
# print(f"Test set results:",
#       f"loss= {loss_test.item():.4f}",
#       f"accuracy= {acc_test.item():.4f}")