In [36]:
!pip install -U gensim karateclub ogb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
# https://stackoverflow.com/questions/68860621/ogb-dataset-i-can-not-import-pygnodeproppreddataset-from-ogb-nodeproppred
# https://stackoverflow.com/questions/67285115/building-wheels-for-torch-sparse-in-colab-takes-forever/73534928#73534928

import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Found existing installation: torch-scatter 2.0.9
Uninstalling torch-scatter-2.0.9:
  Successfully uninstalled torch-scatter-2.0.9
Found existing installation: torch-sparse 0.6.15
Uninstalling torch-sparse-0.6.15:
  Successfully uninstalled torch-sparse-0.6.15
Found existing installation: torch-geometric 2.1.0
Uninstalling torch-geometric-2.1.0:
  Successfully uninstalled torch-geometric-2.1.0
Found existing installation: torch-cluster 1.6.0
Uninstalling torch-cluster-1.6.0:
  Successfully uninstalled torch-cluster-1.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-sparse
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15-cp37-cp37m-linux_x86_64.whl (3.5 MB)
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.15


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-cluster
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl (2.4 MB)
Installing collected packages: torch-cluster
Successfully installed torch-cluster-1.6.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pytorch_geometric.git to /tmp/pip-req-build-zb1qe7gm
  Running command git clone -q https://github.com/pyg-team/pytorch_geometric.git /tmp/pip-req-build-zb1qe7gm
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.1.0-py3-none-any.whl size=755541 sha256=cc8a17782e43ff034585ca0db2c86242400b0116cd9565ca8b2bf2e24d78bce1
  Stored in directory: /tmp/pip-ephem-wheel-cache-x3t8aedf/wheels/85/c9/07/7936efecad79b906348a7e9fb644d914160544efa9aa7f4b2b
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.1.0


In [38]:
from collections import defaultdict 
from karateclub import Node2Vec 
from ogb.nodeproppred import PygNodePropPredDataset 

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split 
from torch_geometric.utils import to_networkx
from typing import List 
from tqdm import tqdm 
from gensim.models import Word2Vec 

import matplotlib.pyplot as plt 
import networkx as nx 
import numpy as np 
import pandas as pd 
import random 
import torch 

import warnings
warnings.filterwarnings("ignore")

In [39]:
def compute_probs(G, p, q):
    """
    Computes probablity distribution of neighbouring nodes
    graph: input graph 
    probs: empty dictionary 
    p: return parameter
    q: in-out parameter
    """
    for source_node in G.nodes(): 
        for current_node in G.neighbors(source_node): 
            probs_ = [] 
            for destination in G.neighbors(current_node): 
                if destination==source_node: 
                    prob_ = G[current_node][destination].get('weight',1)*(1/p) 
                elif destination in G.neighbors(source_node): 
                    prob_ = G[current_node][destination].get('weight',1) 
                else: 
                    prob_ = G[current_node][destination].get('weight',1)*(1/q) 
                probs_.append(prob_)
            probs[source_node]['probablities'][current_node] = probs_/np.sum(probs_)
    return probs


In [40]:
def generate_random_walks(G, probs, max_walks, walk_len):
    """
    :graph: input graph
    :probs: node probablity distribution
    :max_walks: maximum number of random walks per node 
    walk_len: maximum number of nodes in a random walk
    """
    random_walks = [] 
    for start_node in G.nodes(): 
        next_node_options = list(G[start_node])
        if len(next_node_options)==0:
            break
        for i in range(max_walks):
            current_walk = [start_node]
            next_node_options = list(G[start_node])
            next_node = np.random.choice(next_node_options)
            current_walk.append(next_node)
            for j in range(walk_len-2):
                next_node_options = list(G[current_walk[-1]])
                if len(next_node_options)==0:
                    break                
                probablities = probs[current_walk[-2]]['probablities'][current_walk[-1]]
                next_node = np.random.choice(next_node_options, p=probablities) 
                current_walk.append(next_node)
            random_walks.append(current_walk)
    random.shuffle(random_walks)
    random_walks = [list(map(str,random_walk)) for random_walk in random_walks]
    return random_walks 


In [41]:
def node2vec(random_walks, window_size, vector_size): 
    """
    returns word2vec model with given parameters
    """
    model = Word2Vec(sentences=random_walks, vector_size=vector_size, window=window_size) 
    return model.wv 


In [42]:
# https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/arxiv/node2vec.py

vector_size = 128  
max_nodes = None # select a subset of nodes for large datasets
node_sampling_stategy = 'sequential' # ('random', 'sequential')
p = 1.0 
q = 1.0 
random_state = 42 
test_size = 0.2 
walk_length = 80 
walk_number = 10  
window_size = 20 

In [43]:
dataset = PygNodePropPredDataset('ogbn-arxiv') 
G = to_networkx(dataset.data, to_undirected=False)

if max_nodes is not None:
    print('sampling nodes')
    if node_sampling_stategy=='random':
        subset_nodes = random.sample(G.nodes, max_nodes)
    else:
        subset_nodes = list(G.nodes)[:max_nodes] 
    G = G.subgraph(subset_nodes) 
    mapping = {v:w for w,v in enumerate(sorted(G))}
    G = nx.relabel_nodes(G, mapping)

probs = defaultdict(dict) 
for node in G.nodes(): 
    probs[node]['probablities'] = dict() 
probs = compute_probs(G, p, q) 
random_walks = generate_random_walks(G, probs, walk_number, walk_length) 
nv_emb = node2vec(random_walks, window_size, vector_size) 
implemented_embeddings = nv_emb.vectors


In [44]:
y = dataset.data['y']

nodes_in_vocabulary = nv_emb.key_to_index.keys()
y = [int(v) for w,v in enumerate(y) if str(w) in nodes_in_vocabulary]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(implemented_embeddings, y, random_state=random_state, test_size=test_size) 

model = LogisticRegression() 
model.fit(X_train, y_train) 
y_pred = model.predict(X_test) 
print(accuracy_score(y_test,y_pred)) 

0.5


In [46]:
# print(classification_report(y_test, y_pred))

In [47]:
# model = Node2Vec(walk_length=walk_length, walk_number=walk_number, window_size=window_size, dimensions=vector_size, p=p, q=q) 
# model.fit(G) 
# karateclub_embeddings = model.get_embedding() 

In [48]:
# X_train, X_test, y_train, y_test = train_test_split(karateclub_embeddings, y, random_state=random_state, test_size=test_size) 
# model = LogisticRegression() 
# model.fit(X_train, y_train) 
# y_pred = model.predict(X_test) 
# print(accuracy_score(y_test,y_pred))

In [49]:
# print(classification_report(y_test, y_pred))