Make a copy of this notebook. When running this notebook on Colab, ensure that you've set your Runtime > Change runtime type to Python 3 and GPU.

---



In [1]:
# Install the PyTorch Geometric extension packages (built from source, hence
# the verbose / no-cache flags), PyTorch Geometric itself and tensorboardX.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve
# different releases than the ones shown in the logged output below.
!pip install --verbose --no-cache-dir torch-scatter
!pip install --verbose --no-cache-dir torch-sparse
!pip install --verbose --no-cache-dir torch-cluster
!pip install torch-geometric
!pip install tensorboardX
# Download and unpack the ngrok binary.
# NOTE(review): nothing in the visible cells uses ngrok — presumably meant for
# tunnelling TensorBoard out of Colab; confirm before removing.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

Created temporary directory: /tmp/pip-ephem-wheel-cache-vg9yp4bc
Created temporary directory: /tmp/pip-req-tracker-x_8gqri8
Created requirements tracker '/tmp/pip-req-tracker-x_8gqri8'
Created temporary directory: /tmp/pip-install-pyewicda
1 location(s) to search for versions of torch-scatter:
* https://pypi.org/simple/torch-scatter/
Getting page https://pypi.org/simple/torch-scatter/
Found index url https://pypi.org/simple
Starting new HTTPS connection (1): pypi.org:443
https://pypi.org:443 "GET /simple/torch-scatter/ HTTP/1.1" 200 1284
Analyzing links from page https://pypi.org/simple/torch-scatter/
  Found link https://files.pythonhosted.org/packages/29/96/566ac314e796d4b07209a3b88cc7a8d2e8582d55819e33f72e6c0e8d8216/torch_scatter-0.3.0.tar.gz#sha256=9e5e5a6efa4ef45f584e8611f83690d799370dd122b862646751ae112b685b50 (from https://pypi.org/simple/torch-scatter/), version: 0.3.0
  Found link https://files.pythonhosted.org/packages/6a/b0/ecffacddf573c147c70c6e43ce05d24f007155ce3fb436959d3

In [0]:
import torch.optim as optim

def build_optimizer(args, params):
    """Create the optimizer and optional LR scheduler described by ``args``.

    Args:
        args: Object exposing ``opt`` ('adam', 'sgd', 'rmsprop' or 'adagrad'),
            ``lr``, ``weight_decay`` and ``opt_scheduler`` ('none', 'step' or
            'cos'). The 'step' scheduler additionally reads ``opt_decay_step``
            and ``opt_decay_rate``; the 'cos' scheduler reads ``opt_restart``.
        params: Iterable of parameters. Only those with ``requires_grad`` set
            are handed to the optimizer.

    Returns:
        Tuple ``(scheduler, optimizer)``. ``scheduler`` is ``None`` when
        ``args.opt_scheduler == 'none'``.

    Raises:
        ValueError: If ``args.opt`` or ``args.opt_scheduler`` is not one of the
            supported names (previously this surfaced as an opaque
            ``UnboundLocalError``).
    """
    weight_decay = args.weight_decay
    trainable_params = filter(lambda p: p.requires_grad, params)

    if args.opt == 'adam':
        optimizer = optim.Adam(trainable_params, lr=args.lr, weight_decay=weight_decay)
    elif args.opt == 'sgd':
        optimizer = optim.SGD(trainable_params, lr=args.lr, momentum=0.95, weight_decay=weight_decay)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(trainable_params, lr=args.lr, weight_decay=weight_decay)
    elif args.opt == 'adagrad':
        optimizer = optim.Adagrad(trainable_params, lr=args.lr, weight_decay=weight_decay)
    else:
        raise ValueError('Unknown optimizer: {!r}'.format(args.opt))

    if args.opt_scheduler == 'none':
        return None, optimizer
    elif args.opt_scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.opt_decay_step, gamma=args.opt_decay_rate)
    elif args.opt_scheduler == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.opt_restart)
    else:
        raise ValueError('Unknown scheduler: {!r}'.format(args.opt_scheduler))
    return scheduler, optimizer

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

class GNNStack(torch.nn.Module):
    """Stack of graph convolution layers followed by a small MLP head.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Output size of every convolution layer.
        output_dim: Number of classes produced by the head.
        args: Object exposing ``model_type`` ('GCN', 'GraphSage' or 'GAT'),
            ``num_layers`` (>= 1) and ``dropout``.
        task: 'node' for per-node predictions or 'graph' for per-graph
            predictions (node embeddings are pooled before the head).

    Raises:
        ValueError: If ``args.model_type`` is not a supported layer type.
        AssertionError: If ``args.num_layers`` is smaller than 1.
        RuntimeError: If ``task`` is neither 'node' nor 'graph'.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, args, task='node'):
        super(GNNStack, self).__init__()
        conv_model = self.build_conv_model(args.model_type)
        self.convs = nn.ModuleList()
        self.convs.append(conv_model(input_dim, hidden_dim))
        assert (args.num_layers >= 1), 'Number of layers is not >=1'
        for _ in range(args.num_layers - 1):
            self.convs.append(conv_model(hidden_dim, hidden_dim))

        # post-message-passing
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(args.dropout),
            nn.Linear(hidden_dim, output_dim))

        self.task = task
        if not (self.task == 'node' or self.task == 'graph'):
            raise RuntimeError('Unknown task.')

        self.dropout = args.dropout
        self.num_layers = args.num_layers

    def build_conv_model(self, model_type):
        """Return the convolution layer class that matches ``model_type``.

        Raises:
            ValueError: If ``model_type`` is not 'GCN', 'GraphSage' or 'GAT'.
                (Previously ``None`` was returned and the failure only showed
                up later as "'NoneType' object is not callable".)
        """
        if model_type == 'GCN':
            return pyg_nn.GCNConv
        elif model_type == 'GraphSage':
            return GraphSage
        elif model_type == 'GAT':
            # When applying GAT with num heads > 1, one needs to modify the
            # input and output dimension of the conv layers (self.convs),
            # to ensure that the input dim of the next layer is num heads
            # multiplied by the output dim of the previous layer.
            # HINT: to play with multiple heads, build the later layers with
            # conv_model(hidden_dim * num_heads, hidden_dim) and make the first
            # layer of post_mp nn.Linear(hidden_dim * num_heads, hidden_dim).
            return GAT
        raise ValueError('Unknown model_type: {!r}'.format(model_type))

    def forward(self, data):
        """Return log-probabilities for every node (or graph) in ``data``.

        Reads ``data.x``, ``data.edge_index`` and ``data.batch``. As a side
        effect the embeddings produced by the last convolution layer (after
        ReLU and dropout, before pooling) are stored in
        ``self.final_embeddings``.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Each layer is: convolution -> ReLU -> dropout.
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            # training=self.training makes dropout a no-op in eval() mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        self.final_embeddings = x

        if self.task == 'graph':
            # NOTE(review): the original exercise text asks for
            # pyg_nn.global_max_pool here; this implementation pools with the
            # mean instead. Kept as-is to preserve existing results.
            x = pyg_nn.global_mean_pool(x, batch)

        x = self.post_mp(x)

        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Negative log-likelihood loss for log-probabilities ``pred``."""
        return F.nll_loss(pred, label)


class GraphSage(pyg_nn.MessagePassing):
    """Non-minibatch version of GraphSage.

    Neighbour features are transformed by a linear layer + ReLU, averaged, and
    concatenated with the node's own features before a second linear layer +
    ReLU. The result is optionally L2-normalised.

    Args:
        in_channels: Size of the input node features.
        out_channels: Size of the produced node embeddings.
        reducer: Currently unused; aggregation is always 'mean'.
        normalize_embedding: Whether to L2-normalise the output embeddings.
    """
    def __init__(self, in_channels, out_channels, reducer='mean',
                 normalize_embedding=True):
        super(GraphSage, self).__init__(aggr='mean')

        # self.weight: linear transformation applied to each neighbour before
        # aggregation.
        self.weight = nn.Linear(in_channels, out_channels)
        # self.agg_lin: linear transformation applied to the concatenation of
        # the aggregated neighbours and the node's own features (skip
        # connection).
        self.agg_lin = nn.Linear(in_channels + out_channels, out_channels)

        # Always define the flag. It used to be set only when the argument was
        # truthy, so normalize_embedding=False crashed in update() with an
        # AttributeError.
        self.normalize_emb = bool(normalize_embedding)

    def forward(self, x, edge_index):
        """Propagate messages over ``edge_index`` and return new embeddings."""
        num_nodes = x.size(0)
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        return self.propagate(edge_index, size=(num_nodes, num_nodes), x=x)

    def message(self, x_j, edge_index, size):
        """Transform each neighbour feature; the mean is taken by propagate."""
        # x_j has shape [E, in_channels]
        # edge_index has shape [2, E]
        x_j = F.relu(self.weight(x_j))

        return x_j

    def update(self, aggr_out, x):
        """Combine aggregated neighbours with the node's own features."""
        # aggr_out has shape [N, out_channels]
        # x has shape [N, in_channels]
        aggr_out = F.relu(self.agg_lin(torch.cat([aggr_out, x], dim=1)))

        if self.normalize_emb:
            aggr_out = F.normalize(aggr_out, p=2, dim=-1)

        return aggr_out


class GAT(pyg_nn.MessagePassing):
    """Graph attention layer.

    Please run code with num_heads=1.

    Args:
        in_channels: Size of the input node features.
        out_channels: Size of the output features per head.
        num_heads: Number of attention heads.
        concat: If True the head outputs are concatenated, otherwise averaged.
        dropout: Dropout probability applied to the attention coefficients.
        bias: Whether to add a learnable bias to the output.
        **kwargs: Forwarded to ``MessagePassing.__init__``.
    """
    def __init__(self, in_channels, out_channels, num_heads=1, concat=True,
                 dropout=0, bias=True, **kwargs):
        super(GAT, self).__init__(aggr='add', **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.heads = num_heads
        self.concat = concat
        self.dropout = dropout

        # Shared linear transformation; one out_channels-sized slice per head.
        self.weight = nn.Linear(in_channels, num_heads * out_channels)

        # Attention vector per head, applied to the concatenation of the
        # transformed features of the two end points of an edge.
        self.att = nn.Parameter(torch.Tensor(1, num_heads, 2 * out_channels))

        if bias and concat:
            self.bias = nn.Parameter(torch.Tensor(self.heads * out_channels))
        elif bias and not concat:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        nn.init.xavier_uniform_(self.att)
        # self.bias is None when bias=False; initialising it unconditionally
        # used to crash the constructor in that case.
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, x, edge_index, size=None):
        """Linearly transform ``x`` and propagate attention-weighted messages."""
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]
        x = self.weight(x)

        # Start propagating messages.
        return self.propagate(edge_index, size=size, x=x)

    def message(self, edge_index_i, x_i, x_j, size_i):
        """Construct the message to node i for each edge (j, i)."""
        # edge_index_i has shape [E]
        x_j = x_j.view(-1, self.heads, self.out_channels)
        x_i = x_i.view(-1, self.heads, self.out_channels)
        # Un-normalised attention score per edge and head.
        alpha = F.leaky_relu((torch.cat([x_i, x_j], dim=-1) * self.att).sum(dim=-1), 0.2)
        # Normalise the scores over the incoming edges of each target node.
        alpha = pyg_utils.softmax(alpha, edge_index_i, size_i)

        alpha = F.dropout(alpha, p=self.dropout, training=self.training)

        return x_j * alpha.view(-1, self.heads, 1)

    def update(self, aggr_out):
        """Merge the heads (concatenate or average) and add the bias."""
        if self.concat is True:
            aggr_out = aggr_out.view(-1, self.heads * self.out_channels)
        else:
            aggr_out = aggr_out.mean(dim=1)

        if self.bias is not None:
            aggr_out = aggr_out + self.bias
        return aggr_out


In [0]:




import time

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.data import DataLoader

import torch_geometric.nn as pyg_nn
import matplotlib.pyplot as plt
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm
import pandas as pd
from torch_geometric.data import Data
from google.colab import files

# Mount Google Drive into the Colab runtime; train() writes the node-embedding
# .npy files into the mounted Drive.
from google.colab import drive
drive.mount('/content/gdrive')


class YoutubeVideoDataset(InMemoryDataset):
    """Single-graph YouTube video dataset for node classification.

    The node and edge tables are read from tab-separated CSV files hosted on
    GitHub. Node features are six standardised numeric columns; the label is
    the video category. Boolean ``train_mask`` / ``val_mask`` / ``test_mask``
    attributes are attached to the graph (80% / 10% / 10% of the nodes).
    """

    NODES_URL = 'https://raw.githubusercontent.com/kngoledge/CS224W-PRJ/master/data/nodes.csv'
    EDGES_URL = 'https://raw.githubusercontent.com/kngoledge/CS224W-PRJ/master/data/edges.csv'
    LABEL_COLUMN = 'category'
    # Feature vector using all other data (except uploader).
    FEATURE_COLUMNS = ('age', 'length', 'views', 'rate', 'ratings', 'comments')
    # Fixed seed so that re-processing produces the same train/val/test split.
    SPLIT_SEED = 0

    def __init__(self, root, transform=None, pre_transform=None):
        super(YoutubeVideoDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # Nothing is stored in the raw directory; process() reads the CSVs
        # straight from their URLs.
        return []

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        pass

    def process(self):
        """Build the graph from the remote CSV files and cache it on disk."""
        nodes = pd.read_csv(self.NODES_URL, sep='\t', index_col=0)
        edges = pd.read_csv(self.EDGES_URL, sep='\t', index_col=0)

        # Map categories to class indices in order of first appearance. The
        # previous set-based mapping depended on hash ordering and could
        # differ between interpreter runs.
        idx_to_categories = list(nodes[self.LABEL_COLUMN].unique())
        print(idx_to_categories)
        categories_to_idx = {
            category: idx for idx, category in enumerate(idx_to_categories)}

        # Force a float array so that imputed means are not truncated when
        # every feature column happens to be integer-typed.
        feature_vectors = np.asarray(
            [nodes[column] for column in self.FEATURE_COLUMNS], dtype=float).T

        # Replace missing values by the mean of their column.
        col_mean = np.nanmean(feature_vectors, axis=0)
        nan_rows, nan_cols = np.where(np.isnan(feature_vectors))
        feature_vectors[nan_rows, nan_cols] = col_mean[nan_cols]

        # Standardise every column to zero mean and unit variance.
        feature_mean = np.mean(feature_vectors, axis=0)
        feature_std = np.std(feature_vectors, axis=0)
        print("Mean feature:", feature_mean)
        print("Standard deviation feature:", feature_std)
        # A constant column has zero deviation; dividing by it would produce
        # NaN/inf features, so leave such columns centred but unscaled.
        safe_std = np.where(feature_std == 0, 1.0, feature_std)
        feature_vectors = (feature_vectors - feature_mean[np.newaxis, :]) / safe_std[np.newaxis, :]
        x = torch.tensor(feature_vectors, dtype=torch.float)

        labels = [categories_to_idx[category] for category in nodes[self.LABEL_COLUMN]]
        y = torch.tensor(labels, dtype=torch.long)

        edge_index = torch.tensor(
            np.asarray([edges["u"], edges["v"]]), dtype=torch.long)

        data = Data(x=x, y=y, edge_index=edge_index)

        # Hold out one fifth of the nodes and split it evenly between
        # validation and test; everything else is used for training.
        num_nodes = len(nodes)
        rng = np.random.RandomState(self.SPLIT_SEED)
        held_out = rng.choice(num_nodes, num_nodes // 5, replace=False).astype(int)
        half = len(held_out) // 2

        val_array = np.zeros(num_nodes, dtype=bool)
        test_array = np.zeros(num_nodes, dtype=bool)
        val_array[held_out[:half]] = True
        test_array[held_out[half:]] = True
        train_array = ~(val_array | test_array)

        data.train_mask = torch.tensor(train_array, dtype=torch.bool)
        data.val_mask = torch.tensor(val_array, dtype=torch.bool)
        data.test_mask = torch.tensor(test_array, dtype=torch.bool)

        data_list = [data]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])



# Validation curves collected by train() and plotted by main():
# graphs[dataset_name] -> {"models": [...], "validations": [...], "xs": [...]}
graphs = dict()
# Paths of the embedding files written by train().
files_to_download = []

def train(dataset, task, args):
    """Train a GNNStack on ``dataset`` and record its validation curve.

    Args:
        dataset: Dataset to train on. For the 'graph' task the first 80% of
            the graphs are used for training and the rest for evaluation; for
            the 'node' task the per-node masks stored on the data are used.
        task: 'node' or 'graph'.
        args: Object exposing ``batch_size``, ``hidden_dim``, ``epochs``,
            ``model_type``, ``dataset`` plus everything GNNStack and
            build_optimizer read.

    Side effects:
        Prints the mean training loss per epoch and the evaluation accuracy
        every 10 epochs, saves the final embeddings as a .npy file in the
        mounted Google Drive, appends that path to ``files_to_download`` and
        appends the accuracy curve to ``graphs[args.dataset]``.

    Raises:
        RuntimeError: If ``task`` is neither 'node' nor 'graph'.
    """
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        split_at = int(data_size * 0.8)
        loader = DataLoader(
                dataset[:split_at], batch_size=args.batch_size, shuffle=True)
        test_loader = DataLoader(
                dataset[split_at:], batch_size=args.batch_size, shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    print("Number of node features:",dataset.num_node_features)
    print("Number of classes:",dataset.num_classes)
    model = GNNStack(dataset.num_node_features, args.hidden_dim, int(dataset.num_classes),
                            args, task=task)
    scheduler, opt = build_optimizer(args, model.parameters())

    # The node task evaluates on the validation mask; the graph task evaluates
    # on the held-out test loader. Label the printed accuracy accordingly.
    eval_label = 'validation' if task == 'node' else 'test'

    # train
    model_validation = []
    xs = []
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs

        total_loss /= len(loader.dataset)
        print(total_loss)

        # The scheduler returned by build_optimizer was never advanced, so a
        # configured 'step' / 'cos' schedule had no effect.
        if scheduler is not None:
            scheduler.step()

        if epoch % 10 == 0:
            eval_acc = test(test_loader, model, is_validation=True)
            print(eval_acc, '  ' + eval_label)
            model_validation.append(eval_acc)
            xs.append(epoch)

    if args.epochs > 0:
        # Recompute the embeddings in eval mode so that the saved values are
        # not perturbed by dropout from the last training step.
        model.eval()
        with torch.no_grad():
            for batch in loader:
                model(batch)
        # Absolute path of the Drive mount, so saving does not depend on the
        # current working directory.
        file_name = '/content/gdrive/My Drive/'+args.model_type+'_node_embeddings_with_feature_trial.npy'
        # .cpu() is required before .numpy() when the model runs on a GPU.
        np.save(file_name, model.final_embeddings.detach().cpu().numpy())
        files_to_download.append(file_name)

    if graphs.get(args.dataset, None) is None:
        graphs[args.dataset] = dict()
        graphs[args.dataset]["models"] = []
        graphs[args.dataset]["validations"] = []
        graphs[args.dataset]["xs"] = []
    graphs[args.dataset]["models"].append(args.model_type)
    graphs[args.dataset]["validations"].append(model_validation)
    graphs[args.dataset]["xs"].append(xs)

def test(loader, model, is_validation=False):
    model.eval()

    correct = 0
    for data in loader:
        with torch.no_grad():
            # max(dim=1) returns values, indices tuple; only need indices
            pred = model(data).max(dim=1)[1]
            label = data.y

        if model.task == 'node':
            mask = data.val_mask if is_validation else data.test_mask
            # node classification: only evaluate on nodes in test set
            pred = pred[mask]
            label = data.y[mask]
            
        correct += pred.eq(label).sum().item()
    
    cora_number = None
    if model.task == 'graph':
        total = len(loader.dataset) 
    else:
        total = 0
        for data in loader.dataset:
            total += torch.sum(data.val_mask if is_validation else data.test_mask).item()
            if cora_number is None:
              cora_number = torch.sum(data.test_mask).item()
    if model.task == 'node':
        print("Youtube test set number of nodes:",cora_number)
    else:
        print("ENZYMES test set number of graphs:",total)
    return correct / total
  
class objectview:
    """Expose the entries of a dict as attributes.

    The dict is used directly as the instance namespace (it is not copied), so
    attribute writes show up in the dict and vice versa.
    """

    def __init__(self, d):
        # Install ``d`` itself as this instance's attribute dictionary.
        object.__setattr__(self, '__dict__', d)

def main():
    """Train GCN, GraphSage and GAT on the YouTube graph and plot the results.

    Every model is trained with the same hyper-parameters. Afterwards one
    figure per dataset shows the recorded accuracy curves, and the working
    directory plus the saved embedding paths are printed.

    Raises:
        ValueError: If a configuration names a dataset other than 'youtube'
            (previously this surfaced as an unbound-variable error).
    """
    import os

    # Hyper-parameters shared by every run; only the model type varies.
    base_config = {
        'dataset': 'youtube', 'num_layers': 2, 'batch_size': 32,
        'hidden_dim': 64, 'dropout': 0.5, 'epochs': 500, 'opt': 'adam',
        'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3,
        'lr': 0.1,
    }

    for model_type in ('GCN', 'GraphSage', 'GAT'):
        args = objectview(dict(base_config, model_type=model_type))
        if args.dataset == 'youtube':
            dataset = YoutubeVideoDataset(root='/tmp/Youtube')
            task = 'node'
        else:
            raise ValueError('Unknown dataset: {!r}'.format(args.dataset))
        train(dataset, task, args)

    # Plot graphs
    for dataset_name in graphs:
        curves = graphs[dataset_name]
        plt.figure()
        for i in range(len(curves["models"])):
            plt.plot(curves["xs"][i], curves["validations"][i], label=curves["models"][i])
        plt.title("Validation Accuracy for "+dataset_name.upper()+" Dataset")
        plt.xlabel('Epoch')
        plt.ylabel('Validation Accuracy')
        plt.legend()
        plt.show()

    print( os.getcwd() )
    print( os.listdir(os.getcwd()) )
    print(files_to_download)
    # To download the embeddings through the browser instead of Drive, call
    # files.download(path) for every path in files_to_download.

# Run the full experiment when this cell is executed.
if __name__ == '__main__':
    main()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Number of node features: 6
Number of classes: 17
2.8291432857513428
Youtube test set number of nodes: 6020
0.2883241986381   test
4.391548156738281
3.8294167518615723
2.641700267791748
2.555171012878418
2.3178932666778564
2.286609649658203
2.5494332313537598
2.3836233615875244
2.2733144760131836
2.4194447994232178
Youtube test set number of nodes: 6020
0.23368211260587943   test
2.399092435836792
2.3869662284851074
2.2542243003845215
2.2623157501220703
2.2795286178588867
2.3553519248962402
2.250206470489502
2.2286579608917236
2.2254815101623535
2.2201898097991943
Youtube test set number of nodes: 6020
0.2795216741405082   test
2.2072556018829346
2.2056477069854736
2.20430850982666
2.1963343620300293
2.188124656677246
2.1812548637390137
2.1817026138305664
2.1740562915802
2.1696219444274902
2.162987470626831
Youtube test set number of nodes: 6020
0.3097492110