# CS 249 Project GMAERF

## This portion runs the Graph Autoencoder on Cora dataset

In [1]:
## import and setups

import time
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

import torch
from torch import optim
import torch.nn.functional as F

from gae.model import GVAE
from gae.optimizer import loss_function
import gae.utils

from sklearn.metrics.pairwise import paired_distances
from sklearn.metrics import confusion_matrix

%matplotlib inline

%load_ext autoreload
%autoreload 2

args = {
  'dataset': 'cora',
  'epochs': 200,
  'h1_dim': 16,
  'h2_dim': 8,
  'lr': 1e-2,
  'weight_decay': 5e-4,
  # 'weight_decay': 0,
  'dropout': 0,
  'target': 'feat'
}

In [2]:
# print(f"using {args['dataset']} dataset")

# preprocessing
adj, features = gae.utils.load_data(args['dataset'])
n_nodes, feat_dim = features.shape
# print(f"adj dim: {adj.shape}")
# print(adj)
# print(f"fea dim: {features.shape}")
# print(features)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) 
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = gae.utils.mask_test_edges(adj)
adj = adj_train

adj_norm = gae.utils.preprocess_graph(adj)
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = torch.FloatTensor(adj_label.toarray())

if args['target'] == 'adj':
    pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
elif args['target'] == 'feat':
    pos_weight = torch.Tensor([float(features.shape[0] * features.shape[0] - features.sum()) / features.sum()])
    norm = features.shape[0] * features.shape[0] / float((features.shape[0] * features.shape[0] - features.sum()) * 2)


In [3]:
## training

model = GVAE(feat_dim, args['h1_dim'], args['h2_dim'], args['dropout'], target=args['target'])
optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

hidden_emb = None
for epoch in range(args['epochs']):
  t = time.time()
  model.train()
  optimizer.zero_grad()
  recovered, mu, logvar = model(features, adj_norm)
  if args['target'] == 'adj':
    labels = adj_label
  elif args['target'] == 'feat':
    labels = features
  loss = loss_function(preds=recovered, labels=labels,
                       mu=mu, logvar=logvar, n_nodes=n_nodes,
                       norm=norm, pos_weight=pos_weight,
                       target=args['target'])
  loss.backward()
  cur_loss = loss.item()
  optimizer.step()

  hidden_emb = mu.data.numpy()

  metric = 'cosine'

  if args['target'] == 'adj':
    roc_curr, ap_curr = gae.utils.get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
    sim_score = (paired_distances(recovered.detach().numpy(), labels.numpy(), metric=metric)).mean()
    preds = torch.gt(torch.sigmoid(recovered), 0.5).int()
    labels = labels.int()
    acc = torch.mean(torch.eq(preds, labels).float())
    tp = torch.nonzero(preds * labels).size(0)
    fp = torch.nonzero(preds * (labels - 1)).size(0)
    fn = torch.nonzero((preds - 1) * labels).size(0)
    tn = torch.nonzero((preds - 1) * (labels - 1)).size(0)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print(f"Epoch{(epoch+1):4}:", f"train_loss={cur_loss:.5f}",
          f"val_ap={ap_curr:.5f}", f"sim_score={sim_score:.5f}",
          f"time={(time.time()-t):.5f}", f"acc={acc:.5f}", f"tp={tp}", 
          f"fp={fp}", f"fn={fn}", f"tn={tn}", f"precision={precision:.5f}", 
          f"recall={recall:.5f}")
  elif args['target'] == 'feat':
    sim_score = (paired_distances(recovered.detach().numpy(), labels.numpy(), metric=metric)).mean()
    preds = torch.gt(torch.sigmoid(recovered), 0.5).int()
    labels = labels.int()
    acc = torch.mean(torch.eq(preds, labels).float())
    tp = torch.nonzero(preds * labels).size(0)
    fp = torch.nonzero(preds * (labels - 1)).size(0)
    fn = torch.nonzero((preds - 1) * labels).size(0)
    tn = torch.nonzero((preds - 1) * (labels - 1)).size(0)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print(f"Epoch{(epoch+1):4}:", f"train_loss={cur_loss:.5f}",
          f"sim_score={sim_score:.5f}", f"time={(time.time()-t):.5f}",
          f"acc={acc:.5f}", f"tp={tp}", f"fp={fp}", f"fn={fn}", f"tn={tn}",
          f"precision={precision:.5f}", f"recall={recall:.5f}")

 train_loss=1.36161 sim_score=0.86486 time=0.34486 acc=0.60340 tp=42495 fp=1532313 fn=6721 tn=2299035 precision=0.02698 recall=0.86344
Epoch  64: train_loss=1.35965 sim_score=0.86613 time=0.36659 acc=0.60215 tp=42542 fp=1537227 fn=6674 tn=2294121 precision=0.02693 recall=0.86439
Epoch  65: train_loss=1.35679 sim_score=0.86675 time=0.37055 acc=0.59961 tp=42651 fp=1547170 fn=6565 tn=2284178 precision=0.02683 recall=0.86661
Epoch  66: train_loss=1.35392 sim_score=0.86689 time=0.35653 acc=0.59656 tp=42780 fp=1559146 fn=6436 tn=2272202 precision=0.02671 recall=0.86923
Epoch  67: train_loss=1.35140 sim_score=0.86619 time=0.34442 acc=0.59312 tp=42925 fp=1572644 fn=6291 tn=2258704 precision=0.02657 recall=0.87218
Epoch  68: train_loss=1.34869 sim_score=0.86560 time=0.34992 acc=0.59148 tp=43035 fp=1579122 fn=6181 tn=2252226 precision=0.02653 recall=0.87441
Epoch  69: train_loss=1.34539 sim_score=0.86527 time=0.35170 acc=0.59155 tp=43108 fp=1578906 fn=6108 tn=2252442 precision=0.02658 recall=0.8

In [4]:
## validate

# roc_score, ap_score = gae.utils.get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
# print('Test ROC score: ' + str(roc_score))
# print('Test AP score: ' + str(ap_score))

papers = np.genfromtxt(f"data/cora.content", dtype=np.dtype(str))
# print(papers[:,0][:,np.newaxis])

# print(hidden_emb)
# print(papers[:,0][:,np.newaxis].astype(str))
# print(papers[:,-1][:,np.newaxis].astype(str))
X_train = hidden_emb
hidden_emb = torch.gt(torch.sigmoid(torch.from_numpy(hidden_emb.astype(float))), 0.5).int().numpy()
hidden_emb = np.append(papers[:,0][:,np.newaxis].astype(str), hidden_emb.astype(str), axis=1)
hidden_emb = np.append(hidden_emb.astype(str), papers[:,-1][:,np.newaxis].astype(str), axis=1)
print(hidden_emb)
y_train = papers[:,-1][:,np.newaxis].astype(str)

np.savetxt('hidden_emb_gvae.content', hidden_emb, fmt="%s")

[['31336' '1' '0' ... '1' '1' 'Neural_Networks']
 ['1061127' '0' '1' ... '1' '1' 'Rule_Learning']
 ['1106406' '0' '0' ... '1' '1' 'Reinforcement_Learning']
 ...
 ['1128978' '0' '0' ... '0' '1' 'Genetic_Algorithms']
 ['117328' '0' '0' ... '1' '0' 'Case_Based']
 ['24043' '1' '0' ... '1' '0' 'Neural_Networks']]


In [5]:
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.preprocessing import LabelEncoder

classifier = SGDClassifier(verbose=1, max_iter=1000)
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)

classifier.fit(X_train, y_train)
classifier.score(X_train, y_train)
print(sum(classifier.predict(X_train) == y_train) / y_train.shape[0])

5824, Avg. loss: 0.939189
Total training time: 0.01 seconds.
-- Epoch 29
Norm: 7.46, NNZs: 8, Bias: -3.491682, T: 78532, Avg. loss: 0.899884
Total training time: 0.01 seconds.
-- Epoch 30
Norm: 7.21, NNZs: 8, Bias: -3.255640, T: 81240, Avg. loss: 0.860862
Total training time: 0.01 seconds.
-- Epoch 31
Norm: 7.06, NNZs: 8, Bias: -3.740306, T: 83948, Avg. loss: 0.862829
Total training time: 0.01 seconds.
-- Epoch 32
Norm: 6.80, NNZs: 8, Bias: -3.506113, T: 86656, Avg. loss: 0.844122
Total training time: 0.01 seconds.
-- Epoch 33
Norm: 6.78, NNZs: 8, Bias: -4.172225, T: 89364, Avg. loss: 0.818848
Total training time: 0.01 seconds.
-- Epoch 34
Norm: 6.28, NNZs: 8, Bias: -3.948644, T: 92072, Avg. loss: 0.798653
Total training time: 0.01 seconds.
-- Epoch 35
Norm: 6.15, NNZs: 8, Bias: -3.734033, T: 94780, Avg. loss: 0.782178
Total training time: 0.01 seconds.
-- Epoch 36
Norm: 6.03, NNZs: 8, Bias: -3.734299, T: 97488, Avg. loss: 0.734768
Total training time: 0.01 seconds.
-- Epoch 37
Norm: 5

## This portion runs the GCN on cora dataset

In [6]:
# import and setups

# from gcn.models import GCN
# import gcn.utils

# args = {
#   'dataset': 'cora',
#   'epochs': 200,
#   'hidden_dim': 16,
#   'lr': 1e-2,
#   'weight_decay': 5e-4,
#   'dropout': 0.5
# }

In [7]:
# Load data
# adj, features, labels, idx_train, idx_val, idx_test = gcn.utils.load_data()
# n_nodes, feat_dim = features.shape

# # Model and optimizer
# model = GCN(nfeat=feat_dim,
#             nhid=args['hidden_dim'],
#             nclass=labels.max().item() + 1,
#             dropout=args['dropout'])
# optimizer = optim.Adam(model.parameters(),
#                        lr=args['lr'],
#                        weight_decay=args['weight_decay'])

In [8]:
# training

# t_total = time.time()

# for epoch in range(args['epochs']):
#   t = time.time()
#   model.train()
#   optimizer.zero_grad()
#   output = model(features, adj)
#   loss_train = F.nll_loss(output[idx_train], labels[idx_train])
#   acc_train = gcn.utils.accuracy(output[idx_train], labels[idx_train])
#   loss_train.backward()
#   optimizer.step()

#   loss_val = F.nll_loss(output[idx_val], labels[idx_val])
#   acc_val = gcn.utils.accuracy(output[idx_val], labels[idx_val])
#   print(f'Epoch: {(epoch+1):04d}',
#         f'loss_train: {loss_train.item():.4f}',
#         f'acc_train: {acc_train.item():.4f}',
#         f'loss_val: {loss_val.item():.4f}',
#         f'acc_val: {acc_val.item():.4f}',
#         f'time: {(time.time() - t):.4f}s')

# npemb = model.hidden_emb.detach().numpy()
# print(npemb.shape)
# np.savetxt('hidden_emb.content', npemb)

# print("Optimization Finished!")
# print(f"Total time elapsed: {time.time() - t_total:.4f}s")

In [9]:
# testing

# model.eval()
# output = model(features, adj)
# loss_test = F.nll_loss(output[idx_test], labels[idx_test])
# acc_test = gcn.utils.accuracy(output[idx_test], labels[idx_test])
# print(f"Test set results:",
#       f"loss= {loss_test.item():.4f}",
#       f"accuracy= {acc_test.item():.4f}")