In [0]:
%%capture
!pip install torch_geometric
!pip install torch_sparse
!pip install torch_scatter
!pip install torch_cluster
!pip install torch-spline-conv


In [0]:
!git clone https://github.com/leonardocunha2107/Link-Prediction
!mv Link-Prediction/* .
!rm -rf Link-Prediction

num_nodes=33226

Cloning into 'Link-Prediction'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects:   2% (1/36)[Kremote: Counting objects:   5% (2/36)[Kremote: Counting objects:   8% (3/36)[Kremote: Counting objects:  11% (4/36)[Kremote: Counting objects:  13% (5/36)[Kremote: Counting objects:  16% (6/36)[Kremote: Counting objects:  19% (7/36)[Kremote: Counting objects:  22% (8/36)[Kremote: Counting objects:  25% (9/36)[Kremote: Counting objects:  27% (10/36)[Kremote: Counting objects:  30% (11/36)[Kremote: Counting objects:  33% (12/36)[Kremote: Counting objects:  36% (13/36)[Kremote: Counting objects:  38% (14/36)[Kremote: Counting objects:  41% (15/36)[Kremote: Counting objects:  44% (16/36)[Kremote: Counting objects:  47% (17/36)[Kremote: Counting objects:  50% (18/36)[Kremote: Counting objects:  52% (19/36)[Kremote: Counting objects:  55% (20/36)[Kremote: Counting objects:  58% (21/36)[Kremote: Counting objects:  61% (22/36)[Kremote: C

In [0]:
import torch
import torch.nn as nn 
from sklearn.linear_model import LogisticRegression
import numpy as np
from torch_cluster import random_walk

EPS = 1e-15


class Node2Vec(torch.nn.Module):
    r"""The Node2Vec model from the
    `"node2vec: Scalable Feature Learning for Networks"
    <https://arxiv.org/abs/1607.00653>`_ paper where random walks of
    length :obj:`walk_length` are sampled in a given graph, and node embeddings
    are learned via negative sampling optimization.

    Args:
        num_nodes (int): The number of nodes.
        embedding_dim (int): The size of each embedding vector.
        walk_length (int): The walk length.
        context_size (int): The actual context size which is considered for
            positive samples. This parameter increases the effective sampling
            rate by reusing samples across different source nodes.
        walks_per_node (int, optional): The number of walks to sample for each
            node. (default: :obj:`1`)
        p (float, optional): Likelihood of immediately revisiting a node in the
            walk. (default: :obj:`1`)
        q (float, optional): Control parameter to interpolate between
            breadth-first strategy and depth-first strategy (default: :obj:`1`)
        num_negative_samples (int, optional): The number of negative samples to
            use for each node. If set to :obj:`None`, this parameter gets set
            to :obj:`context_size - 1`. (default: :obj:`None`)
    """

    def __init__(self, num_nodes, embedding_dim, walk_length, context_size,text_embed,
                 walks_per_node=1, p=1, q=1, num_negative_samples=None,embed_layers=2):
        super(Node2Vec, self).__init__()
        assert walk_length >= context_size
        if type(text_embed)==str:
            text_tensor=torch.tensor(np.load(text_embed))
        else:
            text_tensor=torch.tensor(text_embed)
        self.text_embed=nn.Embedding.from_pretrained(text_tensor)
        module_list=[nn.Linear(text_tensor.shape[1],embedding_dim)]+ [nn.ReLU(),nn.Linear(embedding_dim,embedding_dim)]*(embed_layers-1)
        self.embedder=nn.Sequential(*module_list)
        
        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.walk_length = walk_length - 1
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.p = p
        self.q = q
        self.num_negative_samples = num_negative_samples

        self.embedding = torch.nn.Embedding(num_nodes, embedding_dim)

        self.reset_parameters()

    def reset_parameters(self):
        self.embedding.reset_parameters()

    def forward(self, subset):
        """Returns the embeddings for the nodes in :obj:`subset`."""
        with torch.no_grad():
            x=self.text_embed(subset)
        return self.embedder(x)


    def __random_walk__(self, edge_index, subset=None):
        if subset is None:
            subset = torch.arange(self.num_nodes, device=edge_index.device)
        subset = subset.repeat(self.walks_per_node)

        rw = random_walk(edge_index[0], edge_index[1], subset,
                         self.walk_length, self.p, self.q, self.num_nodes)

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            walks.append(rw[:, j:j + self.context_size])
        return torch.cat(walks, dim=0)

    def loss(self, edge_index, subset=None):
        r"""Computes the loss for the nodes in :obj:`subset` with negative
        sampling."""
        walk = self.__random_walk__(edge_index, subset)
        start, rest = walk[:, 0], walk[:, 1:].contiguous()
        with torch.no_grad():
            h_start,h_rest=self.text_embed[start],self.text_embed[rest]
        h_start = self.embedder(h_start).view(
            walk.size(0), 1, self.embedding_dim)
        h_rest = self.embedder(h_rest.view(-1)).view(
            walk.size(0), rest.size(1), self.embedding_dim)

        out = (h_start * h_rest).sum(dim=-1).view(-1)
        pos_loss = -torch.log(torch.sigmoid(out) + EPS).mean()

        # Negative sampling loss.
        num_negative_samples = self.num_negative_samples
        if num_negative_samples is None:
            num_negative_samples = rest.size(1)

        neg_sample = torch.randint(self.num_nodes,
                                   (walk.size(0), num_negative_samples),
                                   dtype=torch.long, device=edge_index.device)
        with torch.no_grad():
            h_neg_rest=self.text_embed(neg_sample)
        h_neg_rest = self.embedder(h_neg_rest)

        out = (h_start * h_neg_rest).sum(dim=-1).view(-1)
        neg_loss = -torch.log(1 - torch.sigmoid(out) + EPS).mean()

        return pos_loss + neg_loss


    def test(self, train_z, train_y, test_z, test_y, solver='lbfgs',
             multi_class='auto', *args, **kwargs):
        r"""Evaluates latent space quality via a logistic regression downstream
        task."""
        clf = LogisticRegression(solver=solver, multi_class=multi_class, *args,
                                 **kwargs).fit(train_z.detach().cpu().numpy(),
                                               train_y.detach().cpu().numpy())
        return clf.score(test_z.detach().cpu().numpy(),
                         test_y.detach().cpu().numpy())


    def __repr__(self):
        return '{}({}, {}, p={}, q={})'.format(
            self.__class__.__name__, self.num_nodes, self.embedding_dim,
            self.p, self.q)

In [0]:
import torch
from data import  get_edge_tensor
from torch_geometric.nn.models import Node2Vec

device  = torch.device("cuda")
d=128
num_nodes=33226
n2v=Node2Vec(num_nodes+2,d,5,3)#,'text_embeds.npy',walks_per_node=2).to(device)
optimizer=torch.optim.Adam(n2v.parameters(),lr=1e-2)
edges_tensor=get_edge_tensor(device=device)
print(edges_tensor.shape)
for i in range(19):
    loss=n2v.loss(edges_tensor)
    loss.backward(retain_graph=True)
    optimizer.step()
    print(f"Loss {loss.data}")
    

torch.Size([2, 567246])


RuntimeError: ignored

In [0]:
from draw import draw

draw(n2v(torch.arange(num_nodes)))

In [0]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()
lr.fit(x,y)


