<a href="https://colab.research.google.com/github/juanferEspinosa/Graph-Analytics/blob/main/Simple_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.9.0+cu111.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.9.0+cu111.html
Looking in links: https://data.pyg.org/whl/torch-1.9.0+cu111.html


In [None]:
import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter
import torch.nn as nn
from torch.nn.modules.module import Module
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR,StepLR

import numpy as np
import pandas as pd
import pickle as pkl
import sys
import networkx as nx
import scipy.sparse as sp
import math
import matplotlib.pyplot as plt
import time

In [None]:
# Mount Google Drive at /content/drive (Colab-only import); the dataset
# files read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Utility functions: visualization

In [None]:
def visualize(h, color, epoch=None, loss=None):
    """Plot either a 2-D embedding tensor or a graph on a 7x7 figure.

    :param h: A torch tensor whose first two columns are used as x/y
        coordinates, or any other object, which is handed to
        ``nx.draw_networkx`` as a graph.
    :param color: Per-point / per-node colors, mapped through the "Set2" colormap.
    :param epoch: Optional epoch number shown in the caption (tensor input only).
    :param loss: Optional loss shown in the caption; ``.item()`` is called on it.
    """
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])

    # Graph input: draw it with a fixed-seed spring layout and stop here.
    if not torch.is_tensor(h):
        layout = nx.spring_layout(h, seed=42)
        nx.draw_networkx(h, pos=layout, with_labels=False,
                         node_color=color, cmap="Set2")
        plt.show()
        return

    # Tensor input: scatter the first two embedding dimensions.
    points = h.detach().cpu().numpy()
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=140, c=color, cmap="Set2")
    caption_available = not (epoch is None or loss is None)
    if caption_available:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

# Preprocessing: Importing datasets

Import the datasets, split them into training, validation and test sets, and normalize them; this step also builds the adjacency matrix, the scattering matrices, the feature matrix and the node index sets.

In [None]:
def normalize_adjacency_matrix(A, I):
  """
  Creating a normalized adjacency matrix with self loops.

  Computes A_tilde = A + I and returns D^(-1/2) * A_tilde * D^(-1/2), where
  D is the diagonal matrix holding the column sums of A_tilde.

  :param A: Sparse adjacency matrix.
  :param I: Identity matrix (same shape as A) supplying the self loops.
  :return A_tilde_hat: Normalized adjacency matrix (scipy sparse)."""

  # Add the self loops to the adjacency matrix. Earlier this line assigned
  # `I` alone, which silently discarded A and always produced the identity.
  A_tilde = A + I

  # np.asarray(...).ravel() gives a flat degree vector whether `sum` returns
  # an np.matrix (sparse matrices) or a 1-D ndarray (sparse arrays).
  degrees = np.asarray(A_tilde.sum(axis=0)).ravel().astype(np.float64)

  # Invert only strictly positive degrees so a zero degree maps to 0, not inf.
  d_inv_sqrt = np.zeros_like(degrees)
  positive = degrees > 0
  d_inv_sqrt[positive] = np.power(degrees[positive], -0.5)

  D = sp.diags(d_inv_sqrt)
  A_tilde_hat = D.dot(A_tilde).dot(D)
  return A_tilde_hat

def normalize(mx):
  """Row-normalize sparse matrix ---> Node features

  Every row is divided by its sum; rows whose sum is zero are left as all
  zeros.

  :param mx: Sparse matrix (e.g. the node-feature matrix).
  :return: Sparse matrix with the same shape whose non-empty rows sum to 1.
  """
  rowsum = np.asarray(mx.sum(1)).flatten()
  # Integer inputs cannot be raised to a negative power by NumPy, so promote
  # them; floating inputs keep their own precision.
  if not np.issubdtype(rowsum.dtype, np.floating):
    rowsum = rowsum.astype(np.float64)

  # Invert only the non-zero sums: empty rows get a scale of 0 without ever
  # producing inf (and the accompanying divide-by-zero warning).
  r_inv = np.zeros_like(rowsum)
  nonzero = rowsum != 0
  r_inv[nonzero] = np.power(rowsum[nonzero], -1)

  r_mat_inv = sp.diags(r_inv)
  return r_mat_inv.dot(mx)

def normalizemx(mx):
  """Normalization for Scattering GCN

  Returns mx * D^(-1), where D is the diagonal matrix of the column sums of
  mx, i.e. every column is divided by its sum. Columns whose sum is zero
  are scaled by 0 instead of inf.

  :param mx: Sparse matrix to normalize.
  :return: Column-normalized sparse matrix.
  """
  # Flat degree vector regardless of whether `sum` yields an np.matrix
  # (sparse matrices) or a 1-D ndarray (sparse arrays).
  degrees = np.asarray(mx.sum(axis=0)).ravel().astype(np.float64)

  d_inv = np.zeros_like(degrees)
  nonzero = degrees != 0
  d_inv[nonzero] = 1.0 / degrees[nonzero]

  D = sp.diags(d_inv)
  return mx.dot(D)


def scattering1st(spmx,order):
  """Build a scattering-style operator from a sparse matrix on the GPU.

  Forms P = 0.5 * (spmx + I), multiplies it by P a further `order - 1` times,
  and returns the product of (that power minus I) with that power.

  Every intermediate is moved to the GPU with `.cuda()`, so a CUDA device is
  required. The only visible call sites (inside `load_citation`) are
  commented out.

  :param spmx: Square scipy sparse matrix; the `WD^-1` comment below suggests
      it is expected to be already column-normalized -- TODO confirm.
  :param order: Positive integer controlling how many times P is multiplied.
  :return: Result of the final `torch.spmm` call.
  """
  I_n = sp.eye(spmx.shape[0])
  adj_sct = 0.5*(spmx+I_n) # P = 1/2 * (I + WD^-1)
  adj_power = adj_sct
  # Convert both copies of P (and, below, the identity) to torch sparse tensors.
  adj_power = sparse_mx_to_torch_sparse_tensor(adj_power).cuda()
  adj_sct = sparse_mx_to_torch_sparse_tensor(adj_sct).cuda()
  I_n = sparse_mx_to_torch_sparse_tensor(I_n)
  if order>1:
    for i in range(order-1):
      # Generating P^(2^(k-1))
      # NOTE(review): each pass multiplies by P once, so after the loop this
      # looks like P^order rather than P^(2^(k-1)) -- confirm the intent.
      adj_power = torch.spmm(adj_power,adj_sct.to_dense())
      print('Generating SCT')
    # Generating. final scattering of order K -> (I - P^(2^(k-1))) * P^(2^(k-1))
    # NOTE(review): the code subtracts I from the power (power - I), which is
    # the opposite sign of the (I - power) written above -- confirm.
    adj_int = torch.spmm((adj_power-I_n.cuda()),adj_power)
  else:
    # Generating. final scattering of order K -> (I - P^(2^(k-1))) * P^(2^(k-1))
    # NOTE(review): same sign remark as in the branch above.
    adj_int = torch.spmm((adj_power-I_n.cuda()),adj_power.to_dense())
  return adj_int


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
  """Convert a scipy sparse matrix to a torch sparse tensor.

  :param sparse_mx: Any scipy sparse matrix; it is converted to COO format
      and cast to float32.
  :return: A float32 torch sparse COO tensor with the same shape and entries.
  """
  sparse_mx = sparse_mx.tocoo().astype(np.float32)
  # Indices are a 2 x nnz int64 array: row coordinates on top, columns below.
  indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
  values = torch.from_numpy(sparse_mx.data)
  shape = torch.Size(sparse_mx.shape)
  # torch.sparse_coo_tensor supersedes the deprecated legacy constructor
  # torch.sparse.FloatTensor; the dtype follows `values` (float32).
  return torch.sparse_coo_tensor(indices, values, shape)


def parse_index_file(filename):
  """Parse an index file containing one integer per line.

  :param filename: Path of the text file to read.
  :return: List of the integers in file order; blank lines are skipped.
  :raises ValueError: If a non-blank line is not a valid integer.
  """
  # The context manager closes the handle even if a line fails to parse.
  with open(filename) as index_file:
    return [int(line) for line in index_file if line.strip()]

def accuracy(output, labels):
  """Fraction of rows in `output` whose highest-scoring column equals the label.

  :param output: 2-D tensor of per-class scores, one row per sample.
  :param labels: 1-D tensor of target class indices.
  :return: 0-dim float64 tensor holding correct / len(labels).
  """
  _, predicted = output.max(1)
  hits = predicted.type_as(labels).eq(labels)
  n_correct = hits.double().sum()
  return n_correct / len(labels)

In [None]:
def load_citation(dataset_str="citeseer", normalization="AugNormAdj", cuda=True,
                  data_dir="/content/drive/MyDrive/THESIS/Databases/data"):
  """
  Load Citation Networks Datasets.

  Reads the pickled `ind.<dataset>.{x,y,tx,ty,allx,ally,graph}` files and the
  `ind.<dataset>.test.index` file from `data_dir`, then builds the tensors
  used by the rest of the notebook.

  :param dataset_str: Dataset name (case-insensitive), e.g. "citeseer".
  :param normalization: Currently unused; kept for interface compatibility.
  :param cuda: Currently unused; kept for interface compatibility.
  :param data_dir: Directory containing the `ind.*` files.
  :return: Tuple (features, adj, A_tilde, labels, idx_train, idx_val, idx_test):
      features  - dense FloatTensor of row-normalized node features,
      adj       - torch sparse tensor of the adjacency matrix after `normalizemx`,
      A_tilde   - torch sparse tensor returned by `normalize_adjacency_matrix`,
      labels    - LongTensor of class indices (argmax over the one-hot labels),
      idx_train - LongTensor of indices 0 .. len(y)-1,
      idx_val   - LongTensor of the 500 indices following the training ones,
      idx_test  - LongTensor of the sorted test indices.
  """
  # Normalize the name once so that the data files, the index file and the
  # citeseer special case below all agree on the same spelling.
  dataset_str = dataset_str.lower()

  names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
  objects = []
  for name in names:
    with open("{}/ind.{}.{}".format(data_dir, dataset_str, name), 'rb') as f:
      # SECURITY NOTE: unpickling can execute arbitrary code; only load
      # dataset files that come from a trusted source.
      objects.append(pkl.load(f, encoding='latin1'))

  x, y, tx, ty, allx, ally, graph = tuple(objects)
  test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(data_dir, dataset_str))
  test_idx_range = np.sort(test_idx_reorder)
  if dataset_str == 'citeseer':
    # Fix citeseer dataset (there are some isolated nodes in the graph)
    # Find isolated nodes, add them as zero-vecs into the right position
    test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
    tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
    tx_extended[test_idx_range-min(test_idx_range), :] = tx
    tx = tx_extended
    ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
    ty_extended[test_idx_range-min(test_idx_range), :] = ty
    ty = ty_extended

  # Stack train and test rows, then put the test rows back into index order.
  features = sp.vstack((allx, tx)).tolil()
  features[test_idx_reorder, :] = features[test_idx_range, :]
  adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
  # Symmetrize: keep the larger of adj[i, j] and adj[j, i] for every pair.
  adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
  labels = np.vstack((ally, ty))
  labels[test_idx_reorder, :] = labels[test_idx_range, :]

  idx_test = test_idx_range.tolist()
  idx_train = range(len(y))
  idx_val = range(len(y), len(y)+500)

  #   take from https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py
  #    idx_train = range(140)
  #    idx_val = range(200, 500)
  #    idx_test = range(500, 1500)

  # One-hot rows -> class indices.
  labels = torch.LongTensor(labels)
  labels = torch.max(labels, dim=1)[1]
  idx_train = torch.LongTensor(idx_train)
  idx_val = torch.LongTensor(idx_val)
  idx_test = torch.LongTensor(idx_test)

  features = normalize(features)
  A_tilde = normalize_adjacency_matrix(adj,sp.eye(adj.shape[0]))
  adj = normalizemx(adj)

  print('Loading')
  #adj_sct1 = scattering1st(adj,1) ## psi_1 = P(I-P)
  #adj_sct2 = scattering1st(adj,2) # psi_2 = P^2(I-P^2)
  #adj_sct4 = scattering1st(adj,4) # psi_3 = P^4(I-P^4)
  adj = sparse_mx_to_torch_sparse_tensor(adj)
  A_tilde = sparse_mx_to_torch_sparse_tensor(A_tilde)
  features = torch.FloatTensor(np.array(features.todense()))
  return features, adj, A_tilde, labels, idx_train, idx_val, idx_test


In [None]:
# Load the default dataset ("citeseer") once; these module-level names are
# the inputs used by the model and data-loader cells below.
features, adj,A_tilde, labels, idx_train, idx_val, idx_test = load_citation()

Loading




# MODELS

First, the convolutional structure is defined so that it can finally be called from an `nn.Module`.

In [None]:
class MLP(Module):
    """
    A Simple two layers MLP to make SGC a bit better.

    Architecture: Linear(nfeat, nhid) -> PReLU -> Dropout(dp) -> Linear(nhid, nclass).

    :param nfeat: Number of input features per sample.
    :param nhid: Width of the hidden layer.
    :param nclass: Number of output classes (also stored as `num_class`).
    :param dp: Dropout probability applied after the activation.
    """
    def __init__(self, nfeat, nhid, nclass, dp=0.2):
        super(MLP, self).__init__()
        self.W1 = nn.Linear(nfeat, nhid)
        self.W2 = nn.Linear(nhid, nclass)
        self.dp = dp
        self.act = nn.PReLU()
        self.num_class = nclass

    def forward(self, x):
        """Return the unnormalized class scores (logits) for the batch `x`."""
        x = self.act(self.W1(x))
        # Tie dropout to this module's train/eval flag. Building a fresh
        # nn.Dropout on every call (as before) yields a module that is always
        # in training mode, so dropout stayed active under model.eval().
        x = nn.functional.dropout(x, p=self.dp, training=self.training)
        return self.W2(x)


In [None]:
# Hyperparameters for the MLP experiment.
epochs = 200
lr = 0.01
cuda = torch.cuda.is_available()
hidden_channels=16
fastmode = False

# Fix the random seeds so that weight initialization, dropout masks and
# batch shuffling are reproducible across kernel restarts.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# GPU transfer is currently disabled; kept here for reference.
# if cuda:
#     model = model.cuda()
#     features = features.cuda()
#     #adj = adj.cuda()
#     #A_tilde = A_tilde.cuda()
#     labels = labels.cuda()
#     idx_train = idx_train.cuda()
#     idx_val = idx_val.cuda()
#     idx_test = idx_test.cuda()


# Model, optimizer, loss and learning-rate schedule used by the training cell.
model = MLP(features.shape[1], hidden_channels, labels.max().item() + 1)
optimizer = optim.Adam(model.parameters(),lr=lr)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=50, gamma=0.9)


In [None]:
from torch.utils import data
class FeaturesData(data.Dataset):
  """Map-style dataset pairing each feature row with its label."""

  def __init__(self, X, y):
    # X and y are indexed in parallel, so sample i is (X[i], y[i]).
    self.features, self.labels = X, y

  def __len__(self):
    """Number of samples, taken from the feature container."""
    return len(self.features)

  def __getitem__(self, index):
    """Return the (features, label) pair stored at `index`."""
    return self.features[index], self.labels[index]


def get_data_loaders(X_train, y_train, X_val, y_val, batch_size=None):
  """Wrap the train / validation splits in DataLoaders.

  :param X_train: Training features, indexable along the first dimension.
  :param y_train: Training labels aligned with `X_train`.
  :param X_val: Validation features.
  :param y_val: Validation labels aligned with `X_val`.
  :param batch_size: Samples per batch. `None` (the default) means
      full-batch: each loader yields its whole split as a single batch.
  :return: (trainLoader, valLoader); only the training loader shuffles.
  """
  train_set = FeaturesData(X_train, y_train)
  val_set = FeaturesData(X_val, y_val)

  # The batch size used to be X_train.shape[1], i.e. the number of feature
  # columns, which has no relation to the number of samples. Default to one
  # batch per split instead; max(1, ...) keeps the value valid for DataLoader.
  train_batch = batch_size if batch_size is not None else max(1, len(train_set))
  val_batch = batch_size if batch_size is not None else max(1, len(val_set))

  trainLoader = data.DataLoader(dataset=train_set, batch_size=train_batch, shuffle=True)
  valLoader = data.DataLoader(dataset=val_set, batch_size=val_batch, shuffle=False)
  return trainLoader, valLoader

# Build the module-level loaders from the train / validation index splits;
# the train() function defined below reads `trainLoader` and `valLoader`.
trainLoader, valLoader = get_data_loaders(features[idx_train], labels[idx_train], features[idx_val], labels[idx_val])

In [None]:
from time import perf_counter

def _evaluate_accuracy(loader):
  """Return the fraction of samples in `loader` that the global `model` classifies correctly."""
  correct = 0.0
  seen = 0
  for x, y in loader:
    output = model(x)
    # Accumulate over every batch so multi-batch loaders are scored in full.
    correct += output.argmax(1).eq(y).double().sum().item()
    seen += y.size(0)
  return correct / seen if seen else 0.0


def train(verbose=True, patience=800):
  """Train the global `model` and keep the weights with the best validation accuracy.

  Uses the module-level `model`, `optimizer`, `criterion`, `trainLoader`,
  `valLoader` and `epochs`. Whenever the validation accuracy improves, the
  weights are written to "best_gfnn.pkl"; after training, that checkpoint is
  loaded back and re-evaluated.

  NOTE(review): the module-level `scheduler` is not stepped here, so the
  learning rate stays at its initial value -- confirm whether that is intended.

  :param verbose: Print a line each time the validation accuracy improves.
  :param patience: Stop after this many consecutive epochs without improvement.
  :return: (model, acc_val, train_time) where `acc_val` is the validation
      accuracy of the returned weights and `train_time` is the elapsed
      training time in seconds (final evaluation excluded).
  """
  best = 0
  best_ep = 0
  wait = 0
  saved_checkpoint = False
  t = perf_counter()

  for epoch in range(epochs):
    model.train()
    for x, y in trainLoader:
      optimizer.zero_grad()
      output = model(x)
      loss_train = criterion(output, y)
      loss_train.backward()
      optimizer.step()

    # Early stopping
    model.eval()
    with torch.no_grad():
      acc_val = _evaluate_accuracy(valLoader)
    if acc_val > best:
      if verbose:
        print("Epoch\t{} - Val acc: {:.4f}".format(epoch, acc_val))
      best = acc_val
      best_ep = epoch
      wait = 0
      torch.save(model.state_dict(), "best_gfnn.pkl")
      saved_checkpoint = True
    else:
      wait+=1
    if wait == patience:
      print("Early stopping at epoch {}".format(epoch))
      break
  train_time = perf_counter()-t

  model.eval()
  with torch.no_grad():
    # Reload only a checkpoint written during this call; otherwise a stale
    # file from an earlier run (or a missing file) would be picked up.
    if saved_checkpoint:
      print("Loading at epoch {}".format(best_ep))
      model.load_state_dict(torch.load('best_gfnn.pkl'))
    acc_val = _evaluate_accuracy(valLoader)

  return model, acc_val, train_time




In [None]:
# Repeat the experiment several times and collect the per-run results.
n_runs = 5
acc_validation= []
training_time= []
for i in range(n_runs):
  # Start every run from freshly initialized weights and optimizer state.
  # train() operates on these module-level objects, so without this reset
  # each "run" would keep training the model left over from the previous one
  # and the averaged accuracy would not be over independent runs.
  model = MLP(features.shape[1], hidden_channels, labels.max().item() + 1)
  optimizer = optim.Adam(model.parameters(),lr=lr)
  scheduler = StepLR(optimizer, step_size=50, gamma=0.9)

  model, acc_val, train_time = train()
  acc_validation.append(acc_val)
  training_time.append(train_time)

Epoch	0 - Val acc: 0.2240
Epoch	1 - Val acc: 0.2260
Epoch	2 - Val acc: 0.2320
Epoch	5 - Val acc: 0.2480
Epoch	6 - Val acc: 0.2740
Epoch	7 - Val acc: 0.2780
Epoch	18 - Val acc: 0.3080
Epoch	19 - Val acc: 0.3100
Epoch	20 - Val acc: 0.3360
Epoch	22 - Val acc: 0.3420
Epoch	24 - Val acc: 0.3780
Epoch	27 - Val acc: 0.3820
Epoch	29 - Val acc: 0.4180
Epoch	35 - Val acc: 0.4320
Epoch	55 - Val acc: 0.4340
Epoch	58 - Val acc: 0.4440
Epoch	64 - Val acc: 0.4460
Epoch	70 - Val acc: 0.4560
Epoch	72 - Val acc: 0.4620
Epoch	74 - Val acc: 0.4700
Epoch	79 - Val acc: 0.4780
Epoch	89 - Val acc: 0.4940
Epoch	108 - Val acc: 0.4960
Epoch	160 - Val acc: 0.5020
Epoch	168 - Val acc: 0.5100
Loading at epoch 168
Epoch	0 - Val acc: 0.4720
Epoch	1 - Val acc: 0.4820
Epoch	3 - Val acc: 0.4860
Epoch	5 - Val acc: 0.4900
Epoch	9 - Val acc: 0.4960
Epoch	14 - Val acc: 0.4980
Epoch	42 - Val acc: 0.5000
Epoch	46 - Val acc: 0.5060
Epoch	56 - Val acc: 0.5120
Loading at epoch 56
Epoch	0 - Val acc: 0.4860
Epoch	1 - Val acc: 0.49

In [None]:
# Aggregate the repeated runs: both values are arithmetic means over the
# entries of `acc_validation` / `training_time`, despite the "total" naming.
total_val_accuracy = np.mean(acc_validation)
total_training_time = np.mean(training_time)
print('total accuracy', total_val_accuracy)
print('total training time', total_training_time)


total accuracy 0.48040000000000005
total training time 3.970145989000014
