In [25]:
!pip install -U gensim karateclub ogb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
# https://stackoverflow.com/questions/68860621/ogb-dataset-i-can-not-import-pygnodeproppreddataset-from-ogb-nodeproppred
# https://stackoverflow.com/questions/67285115/building-wheels-for-torch-sparse-in-colab-takes-forever/73534928#73534928

import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Found existing installation: torch-scatter 2.0.9
Uninstalling torch-scatter-2.0.9:
  Successfully uninstalled torch-scatter-2.0.9
Found existing installation: torch-sparse 0.6.15
Uninstalling torch-sparse-0.6.15:
  Successfully uninstalled torch-sparse-0.6.15
Found existing installation: torch-geometric 2.1.0
Uninstalling torch-geometric-2.1.0:
  Successfully uninstalled torch-geometric-2.1.0
Found existing installation: torch-cluster 1.6.0
Uninstalling torch-cluster-1.6.0:
  Successfully uninstalled torch-cluster-1.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-sparse
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15-cp37-cp37m-linux_x86_64.whl (3.5 MB)
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.15


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-cluster
  Using cached https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl (2.4 MB)
Installing collected packages: torch-cluster
Successfully installed torch-cluster-1.6.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pytorch_geometric.git to /tmp/pip-req-build-1cb4eeoy
  Running command git clone -q https://github.com/pyg-team/pytorch_geometric.git /tmp/pip-req-build-1cb4eeoy
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.1.0-py3-none-any.whl size=755541 sha256=f9e0626487821bff372ec7f51196975d4e472488011cf4523b2eebceaa796816
  Stored in directory: /tmp/pip-ephem-wheel-cache-ynipglxv/wheels/85/c9/07/7936efecad79b906348a7e9fb644d914160544efa9aa7f4b2b
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.1.0


In [27]:
from karateclub import GraRep 
from ogb.nodeproppred import PygNodePropPredDataset 
from scipy.sparse import coo_matrix, identity
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split 
from torch_geometric.utils import to_networkx
from typing import List 
from tqdm import tqdm 

import math 
import matplotlib.pyplot as plt 
import networkx as nx 
import numpy as np 
import pandas as pd 
import random 
import torch 

import warnings
warnings.filterwarnings("ignore")

In [28]:
def sparse_identity_matrix(dim, dtype=np.float32):
    A = identity(dim, dtype=dtype) 
    A = coo_matrix(A)  
    return A 

def get_target_matrix(A):
    A_hat = sparse_identity_matrix(A.shape[0]) 
    scores = np.log(A_hat.data) - math.log(A.shape[0]) 
    rows = A_hat.row[scores < 0]
    cols = A_hat.col[scores < 0]
    scores = scores[scores < 0]
    target_matrix = coo_matrix((scores, (rows, cols)),
                                        shape=A.shape,
                                        dtype=np.float32) 
    return target_matrix

In [29]:
dimensions = 64   
max_nodes = None # select a subset of nodes for large datasets 
node_sampling_stategy = 'sequential' # ('random', 'sequential') 
order = 5 # K 
iterations = 100 # number of SVD iterations 
max_iter = int(1e5) # LogisticRegression 
random_state = 42 
test_size = 0.2 

In [30]:
dataset = PygNodePropPredDataset('ogbn-arxiv') 
G = to_networkx(dataset.data, to_undirected=False)
if max_nodes is not None:
    if node_sampling_stategy=='random':
        subset_nodes = random.sample(G.nodes, max_nodes)
    else:
        subset_nodes = list(G.nodes)[:max_nodes] 
    G = G.subgraph(subset_nodes) 
    mapping = {v:w for w,v in enumerate(sorted(G))}
    G = nx.relabel_nodes(G, mapping) 

In [31]:
# calculate the inverse degree matrix 
ind = range(len(G.nodes())) 
degs = [1.0/G.degree(node) for node in range(G.number_of_nodes())] 
D_1 = coo_matrix((degs, (ind, ind)),
                         shape=(G.number_of_nodes(), 
                         G.number_of_nodes()), 
                         dtype=np.float32) 

# (1-step) probability transition matrix 
A = coo_matrix(nx.adjacency_matrix(G), dtype=np.float32)
A = A.dot(D_1)   

In [32]:
# calculate embeddings 
 
embeddings = []
for step in tqdm(range(order)): 
    target_matrix = get_target_matrix(A) 

    svd = TruncatedSVD(n_components=dimensions,
                        n_iter=iterations,
                        random_state=random_state)

    svd.fit(target_matrix)
    embedding = svd.transform(target_matrix)
    embeddings.append(embedding)
implemented_embeddings = np.concatenate(np.array(embeddings),axis=1) 

100%|██████████| 5/5 [05:33<00:00, 66.69s/it]


In [33]:
y = dataset.data['y'] 

In [34]:
X_train, X_test, y_train, y_test = train_test_split(implemented_embeddings, y, random_state=random_state, test_size=test_size) 

model = LogisticRegression(max_iter=max_iter) 
model.fit(X_train, y_train) 
y_pred = model.predict(X_test) 
print(accuracy_score(y_test,y_pred)) 

0.1587292214119106


In [35]:
# # karateclub model

# model = GraRep(dimensions=dimensions, iteration=iterations, order=order, seed=random_state) 
# model.fit(G) 
# karateclub_embeddings = model.get_embedding() 

In [36]:
# X_train, X_test, y_train, y_test = train_test_split(karateclub_embeddings, y, random_state=random_state, test_size=test_size) 
# model = LogisticRegression() 
# model.fit(X_train, y_train) 
# y_pred = model.predict(X_test) 
# print(accuracy_score(y_test,y_pred))