# GAE NODE EMBEDDINGS

In this notebook we compute the node embeddings using a graph auto-encoder (GAE).

In [2]:
import os
import gc
import time
import pickle
import networkx as nx
import numpy as np
import scipy.sparse as sp
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from gensim.models.doc2vec import Doc2Vec
import paths

Mounted at /content/drive


In [3]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Display the selected device as the cell output
device

device(type='cuda')

## 1. Load the graphs

In [5]:
# Load the Doc2Vec model stored at paths.DOC2VEC_PATH; its document vectors
# are used as node features when training the GAE
doc2vec_model = Doc2Vec.load(paths.DOC2VEC_PATH)

In [6]:
# Shape of the document-vector matrix that serves as the node feature matrix
doc2vec_model.docvecs.vectors_docs.shape

(138499, 64)

In [7]:
def normalize_adjacency(A):
    """Row-normalize an adjacency matrix after adding self-loops.

    Computes ``D_tilde^{-1} (A + I)`` where ``D_tilde`` is the diagonal matrix
    holding the row sums of ``A + I``.

    Parameters
    ----------
    A : SciPy sparse matrix or sparse array of shape (n, n)
        Adjacency matrix.

    Returns
    -------
    SciPy sparse matrix
        The normalized matrix; each row with a non-zero sum adds up to 1.
    """
    A_tilde = A + sp.identity(A.shape[0])
    # ``sum(axis=1)`` gives an ``np.matrix`` for sparse matrices but a plain
    # ``ndarray`` for sparse arrays (which have no ``.A1``), so flatten the
    # result through NumPy to support both container types.
    row_sums = np.asarray(A_tilde.sum(axis=1)).ravel()
    D_inv = sp.diags(1.0 / row_sums)
    return D_inv @ A_tilde


def sparse_to_torch_sparse(M):
    """Converts a sparse SciPy matrix to a sparse PyTorch tensor.

    Parameters
    ----------
    M : SciPy sparse matrix
        Any format providing ``tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor with float32 values, int64 indices and the same
        shape as ``M``.
    """
    M = M.tocoo().astype(np.float32)
    # Row and column indices stacked into the (2, nnz) int64 layout that
    # PyTorch expects for COO indices.
    indices = torch.from_numpy(np.vstack((M.row, M.col)).astype(np.int64))
    values = torch.from_numpy(M.data)
    # ``torch.sparse_coo_tensor`` replaces the deprecated
    # ``torch.sparse.FloatTensor`` constructor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(M.shape))


def loss_function(z, adj, device):
    """Reconstruction loss of the graph auto-encoder.

    Every stored entry of ``adj`` is treated as a positive pair with target 1.
    An equal number of node pairs drawn uniformly at random serve as negative
    pairs with target 0 (a random pair may coincide with a real edge). The
    score of a pair is the sigmoid of the dot product of its two embeddings,
    and the loss is the mean squared error between scores and targets.

    Parameters
    ----------
    z : torch.Tensor of shape (n_nodes, dim)
        Node embeddings.
    adj : sparse torch.Tensor
        Sparse COO adjacency; only its stored index pairs are used.
    device : torch.device or str
        Device on which the targets and the random pairs are allocated.

    Returns
    -------
    torch.Tensor
        Scalar loss.
    """
    edge_index = adj._indices()
    n_edges = edge_index.size(1)

    # Draw the negative pairs directly on the target device rather than on the
    # CPU, so indexing ``z`` does not trigger a host-to-device copy per call.
    neg_index = torch.randint(0, z.size(0), edge_index.size(), device=device)

    pos_scores = torch.sigmoid(torch.sum(z[edge_index[0], :] * z[edge_index[1], :], dim=1))
    neg_scores = torch.sigmoid(torch.sum(z[neg_index[0], :] * z[neg_index[1], :], dim=1))

    y_pred = torch.cat((pos_scores, neg_scores), dim=0)
    y = torch.cat(
        (torch.ones(n_edges, device=device), torch.zeros(n_edges, device=device)),
        dim=0,
    )
    return F.mse_loss(y_pred, y)

In [8]:
class GAE(nn.Module):
    """Encoder of a graph auto-encoder.

    Two linear layers, each followed by a multiplication with the (sparse)
    adjacency matrix; a ReLU and dropout sit between the two layers.
    """
    def __init__(self, n_feat, n_hidden_1, n_hidden_2, dropout):
        """Build the layers.

        n_feat is the input feature size, n_hidden_1 the width of the first
        layer, n_hidden_2 the size of the output embeddings and dropout the
        dropout probability applied after the first layer.
        """
        super().__init__()

        self.fc1 = nn.Linear(n_feat, n_hidden_1)
        self.fc2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj):
        """Return the node embeddings for features x_in and adjacency adj."""
        # First layer: linear map, neighbourhood aggregation, ReLU, dropout
        aggregated = torch.spmm(adj, self.fc1(x_in))
        hidden = self.dropout(self.relu(aggregated))
        # Second layer: linear map followed by aggregation, no non-linearity
        projected = self.fc2(hidden)
        return torch.spmm(adj, projected)

In [9]:
# Hyperparameters (read as module-level globals by train_gae)
epochs = 200  # number of optimisation steps; each step uses the whole graph
n_hidden_1 = 32  # output size of the first GAE layer
n_hidden_2 = 64  # output size of the second GAE layer, i.e. the embedding size
learning_rate = 0.01  # learning rate passed to the Adam optimizer
dropout_rate = 0.1  # dropout probability used between the two GAE layers

In [10]:
def train_gae(graph_path):
    """Train a GAE on the graph stored at ``graph_path`` and return embeddings.

    The file is read as an undirected graph from a comma-separated edge list
    with integer node ids. Node features are the document vectors of the
    module-level ``doc2vec_model``. The model, optimizer and training length
    are configured by the module-level ``device``, ``epochs``, ``n_hidden_1``,
    ``n_hidden_2``, ``learning_rate`` and ``dropout_rate``.

    Parameters
    ----------
    graph_path : str
        Path of the edge list file.

    Returns
    -------
    torch.Tensor
        Embedding matrix with one row per node, computed with the trained
        weights in evaluation mode (dropout disabled, no gradient attached).

    Raises
    ------
    ValueError
        If the graph does not have exactly one node per feature row.
    """
    # Loads the graph from its edge list
    G = nx.read_edgelist(graph_path, delimiter=',', nodetype=int, create_using=nx.Graph())
    print(G.number_of_nodes())
    print(G.number_of_edges())

    n = G.number_of_nodes()

    features = doc2vec_model.docvecs.vectors_docs # Node features: one document vector per row

    if n != features.shape[0]:
        raise ValueError(
            'Graph has {} nodes but the feature matrix has {} rows'.format(n, features.shape[0])
        )

    # networkx orders adjacency rows by node insertion order (the order in
    # which ids first appear in the edge list), while feature row i belongs to
    # document i. Sorting the node ids keeps adjacency row i, feature row i and
    # embedding row i on the same node.
    # NOTE(review): assumes node ids are the document indices 0..n-1 - confirm.
    nodelist = sorted(G.nodes())
    adj = nx.adjacency_matrix(G, nodelist=nodelist) # Obtains the adjacency matrix
    adj = normalize_adjacency(adj) # Normalizes the adjacency matrix

    # Transforms the numpy matrices/vectors to torch tensors
    features = torch.FloatTensor(features).to(device)
    adj = sparse_to_torch_sparse(adj).to(device)

    # Creates the model and specifies the optimizer
    model = GAE(features.shape[1], n_hidden_1, n_hidden_2, dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Trains the model
    for epoch in range(epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z = model(features, adj)
        loss = loss_function(z, adj, device)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print('Epoch: {:04d}'.format(epoch+1),
                'loss_train: {:.4f}'.format(loss.item()),
                'time: {:.4f}s'.format(time.time() - t))

    # Final embeddings: the z of the last training step carries dropout noise
    # and predates the last weight update, so recompute it in eval mode.
    model.eval()
    with torch.no_grad():
        z = model(features, adj)

    return z

In [11]:
# Compute the full-graph embeddings only when no cached file exists yet
if not os.path.isfile(paths.FULL_GRAPH_GAE_EMB_PATH):
  # Train before creating the output file: opening it first would leave an
  # empty file behind if training fails, and the guard above would then skip
  # the computation on every later run.
  z_full_graph = train_gae(paths.FULL_GRAPH_EDGELIST_PATH)
  z_full_graph_matrix = z_full_graph.detach().cpu().numpy()

  # Write to a temporary file and rename it, so the final path only ever
  # holds a completely written pickle.
  tmp_path = f"{paths.FULL_GRAPH_GAE_EMB_PATH}.tmp"
  with open(tmp_path, 'wb') as f:
    pickle.dump(z_full_graph_matrix, f)
  os.replace(tmp_path, paths.FULL_GRAPH_GAE_EMB_PATH)

  del z_full_graph, z_full_graph_matrix
  gc.collect()

138499
1091955
Epoch: 0001 loss_train: 0.3861 time: 0.3861s
Epoch: 0011 loss_train: 0.2405 time: 0.2223s
Epoch: 0021 loss_train: 0.2280 time: 0.2150s
Epoch: 0031 loss_train: 0.2129 time: 0.2206s
Epoch: 0041 loss_train: 0.2060 time: 0.2177s
Epoch: 0051 loss_train: 0.1989 time: 0.2133s
Epoch: 0061 loss_train: 0.1956 time: 0.2103s
Epoch: 0071 loss_train: 0.1929 time: 0.2149s
Epoch: 0081 loss_train: 0.1905 time: 0.2110s
Epoch: 0091 loss_train: 0.1882 time: 0.2101s
Epoch: 0101 loss_train: 0.1870 time: 0.2110s
Epoch: 0111 loss_train: 0.1855 time: 0.2109s
Epoch: 0121 loss_train: 0.1845 time: 0.2131s
Epoch: 0131 loss_train: 0.1837 time: 0.2114s
Epoch: 0141 loss_train: 0.1827 time: 0.2123s
Epoch: 0151 loss_train: 0.1818 time: 0.2172s
Epoch: 0161 loss_train: 0.1807 time: 0.2112s
Epoch: 0171 loss_train: 0.1799 time: 0.2176s
Epoch: 0181 loss_train: 0.1788 time: 0.2157s
Epoch: 0191 loss_train: 0.1782 time: 0.2131s


In [12]:
# Compute the train-graph embeddings only when no cached file exists yet
if not os.path.isfile(paths.TRAIN_GRAPH_GAE_EMB_PATH):
  # Train before creating the output file: opening it first would leave an
  # empty file behind if training fails, and the guard above would then skip
  # the computation on every later run.
  z_train = train_gae(paths.TRAIN_EDGELIST_PATH)
  z_train_matrix = z_train.detach().cpu().numpy()

  # Write to a temporary file and rename it, so the final path only ever
  # holds a completely written pickle.
  tmp_path = f"{paths.TRAIN_GRAPH_GAE_EMB_PATH}.tmp"
  with open(tmp_path, 'wb') as f:
    pickle.dump(z_train_matrix, f)
  os.replace(tmp_path, paths.TRAIN_GRAPH_GAE_EMB_PATH)

  del z_train, z_train_matrix
  gc.collect()

138499
698852
Epoch: 0001 loss_train: 0.3460 time: 0.1561s
Epoch: 0011 loss_train: 0.2387 time: 0.1489s
Epoch: 0021 loss_train: 0.2259 time: 0.1568s
Epoch: 0031 loss_train: 0.2112 time: 0.1464s
Epoch: 0041 loss_train: 0.2023 time: 0.1532s
Epoch: 0051 loss_train: 0.1969 time: 0.1505s
Epoch: 0061 loss_train: 0.1931 time: 0.1452s
Epoch: 0071 loss_train: 0.1908 time: 0.1456s
Epoch: 0081 loss_train: 0.1889 time: 0.1454s
Epoch: 0091 loss_train: 0.1874 time: 0.1468s
Epoch: 0101 loss_train: 0.1862 time: 0.1455s
Epoch: 0111 loss_train: 0.1855 time: 0.1510s
Epoch: 0121 loss_train: 0.1844 time: 0.1469s
Epoch: 0131 loss_train: 0.1837 time: 0.1458s
Epoch: 0141 loss_train: 0.1827 time: 0.1472s
Epoch: 0151 loss_train: 0.1819 time: 0.1451s
Epoch: 0161 loss_train: 0.1815 time: 0.1454s
Epoch: 0171 loss_train: 0.1805 time: 0.1453s
Epoch: 0181 loss_train: 0.1801 time: 0.1516s
Epoch: 0191 loss_train: 0.1787 time: 0.1547s


In [13]:
# Compute the test-graph embeddings only when no cached file exists yet
if not os.path.isfile(paths.TEST_GRAPH_GAE_EMB_PATH):
  # Train before creating the output file: opening it first would leave an
  # empty file behind if training fails, and the guard above would then skip
  # the computation on every later run.
  z_test = train_gae(paths.TEST_EDGELIST_PATH)
  z_test_matrix = z_test.detach().cpu().numpy()

  # Write to a temporary file and rename it, so the final path only ever
  # holds a completely written pickle.
  tmp_path = f"{paths.TEST_GRAPH_GAE_EMB_PATH}.tmp"
  with open(tmp_path, 'wb') as f:
    pickle.dump(z_test_matrix, f)
  os.replace(tmp_path, paths.TEST_GRAPH_GAE_EMB_PATH)

  del z_test, z_test_matrix
  gc.collect()

138499
873564
Epoch: 0001 loss_train: 0.3780 time: 0.1840s
Epoch: 0011 loss_train: 0.2420 time: 0.1754s
Epoch: 0021 loss_train: 0.2322 time: 0.1768s
Epoch: 0031 loss_train: 0.2184 time: 0.1899s
Epoch: 0041 loss_train: 0.2106 time: 0.1798s
Epoch: 0051 loss_train: 0.2025 time: 0.1748s
Epoch: 0061 loss_train: 0.1974 time: 0.1815s
Epoch: 0071 loss_train: 0.1930 time: 0.1755s
Epoch: 0081 loss_train: 0.1906 time: 0.1848s
Epoch: 0091 loss_train: 0.1886 time: 0.1773s
Epoch: 0101 loss_train: 0.1868 time: 0.1772s
Epoch: 0111 loss_train: 0.1852 time: 0.1804s
Epoch: 0121 loss_train: 0.1835 time: 0.1754s
Epoch: 0131 loss_train: 0.1824 time: 0.1793s
Epoch: 0141 loss_train: 0.1810 time: 0.1772s
Epoch: 0151 loss_train: 0.1795 time: 0.1744s
Epoch: 0161 loss_train: 0.1781 time: 0.1804s
Epoch: 0171 loss_train: 0.1768 time: 0.1754s
Epoch: 0181 loss_train: 0.1759 time: 0.1780s
Epoch: 0191 loss_train: 0.1745 time: 0.1766s
