In [1]:
!pip install -U gensim karateclub ogb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# https://stackoverflow.com/questions/68860621/ogb-dataset-i-can-not-import-pygnodeproppreddataset-from-ogb-nodeproppred
# https://stackoverflow.com/questions/67285115/building-wheels-for-torch-sparse-in-colab-takes-forever/73534928#73534928

import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Found existing installation: torch-scatter 2.0.9
Uninstalling torch-scatter-2.0.9:
  Successfully uninstalled torch-scatter-2.0.9
Found existing installation: torch-sparse 0.6.15
Uninstalling torch-sparse-0.6.15:
  Successfully uninstalled torch-sparse-0.6.15
Found existing installation: torch-geometric 2.1.0
Uninstalling torch-geometric-2.1.0:
  Successfully uninstalled torch-geometric-2.1.0
Found existing installation: torch-cluster 1.6.0
Uninstalling torch-cluster-1.6.0:
  Successfully uninstalled torch-cluster-1.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-

In [3]:
from collections import defaultdict 
from karateclub import Node2Vec 
from ogb.nodeproppred import PygNodePropPredDataset 

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from torch_geometric.utils import to_networkx
from typing import List 
from tqdm import tqdm 
from gensim.models import Word2Vec 

import matplotlib.pyplot as plt 
import networkx as nx 
import numpy as np 
import pandas as pd 
import random 
import torch 


In [None]:
def compute_probs(G, p, q):
    """
    Computes probablity distribution of neighbouring nodes
    graph: input graph 
    probs: empty dictionary 
    p: return parameter
    q: in-out parameter
    """
    for source_node in G.nodes(): 
        for current_node in G.neighbors(source_node): 
            probs_ = [] 
            for destination in G.neighbors(current_node): 
                if destination==source_node: 
                    prob_ = G[current_node][destination].get('weight',1)*(1/p) 
                elif destination in G.neighbors(source_node): 
                    prob_ = G[current_node][destination].get('weight',1) 
                else: 
                    prob_ = G[current_node][destination].get('weight',1)*(1/q) 
                probs_.append(prob_)
            probs[source_node]['probablities'][current_node] = probs_/np.sum(probs_)
    return probs


In [None]:
def generate_random_walks(G, probs, max_walks, walk_len):
    """
    :graph: input graph
    :probs: node probablity distribution
    :max_walks: maximum number of random walks per node 
    walk_len: maximum number of nodes in a random walk
    """
    random_walks = [] 
    nodes_with_no_walks = []
    for start_node in G.nodes():
        next_node_options = list(G[start_node])
        if len(next_node_options)==0:
            nodes_with_no_walks.append(start_node)
            break 
        for i in range(max_walks):
            current_walk = []
            current_walk.append(start_node)
            next_node_options = list(G[start_node])
            next_node = np.random.choice(next_node_options)
            current_walk.append(next_node)
            for j in range(walk_len-2):
                next_node_options = list(G[current_walk[-1]])
                if len(next_node_options)==0:
                    break                
                probablities = probs[current_walk[-2]]['probablities'][current_walk[-1]]
                next_node = np.random.choice(next_node_options, p=probablities) 
                current_walk.append(next_node)
            random_walks.append(current_walk)
    random.shuffle(random_walks)
    random_walks = [list(map(str,random_walk)) for random_walk in random_walks]
    return random_walks, nodes_with_no_walks


In [None]:
def node2vec(random_walks, window_size, dimensions): 
    """
    returns word2vec model with given parameters
    """
    model = Word2Vec(sentences=random_walks, vector_size=dimensions, window=window_size) 
    return model.wv 


In [None]:
dimensions = 20 
max_nodes = None # select a subset of nodes for large datasets
node_sampling_stategy = 'sequential' # ('random', 'sequential')
p = 1.0 
q = 0.5 
random_state = 42 
test_size = 0.2 
walk_length = 20 
walk_number = 10 
window_size = 20 

In [None]:
# G = nx.karate_club_graph() 
dataset = PygNodePropPredDataset('ogbn-arxiv') 
G = to_networkx(dataset.data, to_undirected=False)

if max_nodes is not None:
    if node_sampling_stategy=='random':
        subset_nodes = random.sample(G.nodes, max_nodes)
    else:
        subset_nodes = list(G.nodes)[:max_nodes] 
    G = G.subgraph(subset_nodes) 
    mapping = {v:w for w,v in enumerate(sorted(G))}
    G = nx.relabel_nodes(G, mapping)

probs = defaultdict(dict) 
for node in G.nodes(): 
    probs[node]['probablities'] = dict() 
probs = compute_probs(G, p, q) 
random_walks, nodes_with_no_walks = generate_random_walks(G, probs, walk_number, walk_length) 
nv_emb = node2vec(random_walks, window_size, dimensions) 
implemented_embeddings = nv_emb.vectors


3


In [None]:
# club_labels = nx.get_node_attributes(G, 'club') 
# y = np.array(list(club_labels.values()))
y = dataset.data['y']

nodes_in_vocabulary = nv_emb.key_to_index.keys()
y = [int(v) for w,v in enumerate(y) if str(w) in nodes_in_vocabulary]


y = y[:int(1e5)]

In [None]:
implemented_embeddings = implemented_embeddings[:int(1e5)]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(implemented_embeddings, y, random_state=random_state, test_size=test_size) 
model = LogisticRegression() 
model.fit(X_train, y_train) 
y_pred = model.predict(X_test) 
print(accuracy_score(y_test,y_pred))

0.3333333333333333


In [None]:
model = Node2Vec(walk_length=walk_length, walk_number=walk_number, window_size=window_size, dimensions=dimensions, p=p, q=q) 
model.fit(G) 
karateclub_embeddings = model.get_embedding() 

karateclub_embeddings

In [None]:
X_train, X_test, y_train, y_test = train_test_split(karateclub_embeddings, y, random_state=random_state, test_size=test_size) 
model = LogisticRegression() 
model.fit(X_train, y_train) 
y_pred = model.predict(X_test) 
print(accuracy_score(y_test,y_pred))

In [None]:
embedding_diff = karateclub_embeddings - implemented_embeddings
embedding_diff_oned = [np.mean(w) for w in embedding_diff]
print(np.mean(embedding_diff_oned))
print(np.std(embedding_diff_oned))