In [None]:
!pip install -U gensim karateclub ogb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# # https://stackoverflow.com/questions/68860621/ogb-dataset-i-can-not-import-pygnodeproppreddataset-from-ogb-nodeproppred
# # https://stackoverflow.com/questions/67285115/building-wheels-for-torch-sparse-in-colab-takes-forever/73534928#73534928

# import torch

# !pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
from karateclub import GraRep 
from ogb.nodeproppred import PygNodePropPredDataset 
from scipy.sparse import coo_matrix, identity
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from torch_geometric.utils import to_networkx
from typing import List 
from tqdm import tqdm 

import math 
import matplotlib.pyplot as plt 
import networkx as nx 
import numpy as np 
import pandas as pd 
import random 
import torch 


In [None]:
def sparse_identity_matrix(dim, dtype=np.float32):
    """Return the ``dim`` x ``dim`` identity as a scipy COO sparse matrix.

    ``dtype`` is the element type of the stored ones (float32 by default).
    """
    return coo_matrix(identity(dim, dtype=dtype))

def get_target_matrix(A):
    """Build the sparse log-score target matrix that is later factorised by SVD.

    For every stored entry ``A[i, j]`` the score ``log(A[i, j]) - log(n)`` is
    computed, with ``n = A.shape[0]``. Only strictly negative scores are kept;
    all other positions are left empty in the sparse result.

    Args:
        A: transition matrix as a scipy sparse matrix in any format (or
            anything else ``coo_matrix`` accepts).

    Returns:
        A float32 ``coo_matrix`` with the same shape as ``A``.
    """
    # Work on a COO view so that row/col/data triplets are available whatever
    # sparse format the caller hands in (a sparse product usually yields CSR).
    A = coo_matrix(A)

    # Stored zeros (or negative values) have no finite logarithm; skip them so
    # that no -inf / nan ends up in the matrix handed to the SVD.
    positive = A.data > 0
    rows = A.row[positive]
    cols = A.col[positive]
    scores = np.log(A.data[positive]) - math.log(A.shape[0])

    keep = scores < 0
    target_matrix = coo_matrix((scores[keep], (rows[keep], cols[keep])),
                               shape=A.shape,
                               dtype=np.float32)
    return target_matrix

In [None]:
# --- experiment configuration -------------------------------------------------
random_state = 42  # seed handed to TruncatedSVD, train_test_split and karateclub's GraRep
test_size = 0.2  # fraction of the samples held out by train_test_split

dimensions = 16  # number of SVD components (embedding width) per transition step
order = 5  # K: number of transition steps, one embedding block per step
iterations = 20  # n_iter of TruncatedSVD / `iteration` of karateclub's GraRep

max_nodes = None  # set to an int to keep only a subset of nodes (large datasets)
# How that subset is chosen: 'random' or 'sequential'.
# (The identifier is misspelt; it is kept because the loading cell reads this name.)
node_sampling_stategy = 'sequential'

In [None]:
# Load ogbn-arxiv and convert its single graph to a directed networkx graph.
dataset = PygNodePropPredDataset('ogbn-arxiv')
# `dataset[0]` is the documented way to obtain the graph object.
G = to_networkx(dataset[0], to_undirected=False)

if max_nodes is not None:
    # Never ask for more nodes than the graph holds.
    n_keep = min(max_nodes, G.number_of_nodes())
    if node_sampling_stategy == 'random':
        # random.sample requires a real sequence (a NodeView is rejected on
        # Python >= 3.11), and a dedicated seeded generator makes the chosen
        # subset reproducible across runs.
        subset_nodes = random.Random(random_state).sample(list(G.nodes), n_keep)
    else:
        subset_nodes = list(G.nodes)[:n_keep]
    G = G.subgraph(subset_nodes)
    # Relabel the kept nodes to 0..n-1, in ascending order of their original id.
    mapping = {v: w for w, v in enumerate(sorted(G))}
    G = nx.relabel_nodes(G, mapping)

In [None]:
# calculate the inverse degree matrix D^-1 (diagonal, indexed by node id 0..n-1)
n_nodes = G.number_of_nodes()
ind = np.arange(n_nodes)
# Isolated nodes have degree 0: give them a zero entry (an all-zero transition
# row) instead of raising ZeroDivisionError.
# NOTE(review): G is built with to_undirected=False; confirm that G.degree is
# the intended normaliser for a directed graph.
degs = [1.0 / deg if deg > 0 else 0.0
        for deg in (G.degree(node) for node in range(n_nodes))]
D_1 = coo_matrix((degs, (ind, ind)),
                 shape=(n_nodes, n_nodes),
                 dtype=np.float32)

# (1-step) probability transition matrix: D^-1 * A, i.e. every row of the
# adjacency matrix is scaled by the inverse degree of its node.
# `nodelist` pins the row/column order to the node ids used for `degs`.
A = coo_matrix(nx.adjacency_matrix(G, nodelist=range(n_nodes)), dtype=np.float32)
A = D_1.dot(A)

ZeroDivisionError: ignored

In [None]:
# calculate embeddings: one truncated SVD per transition step k = 1..order

embeddings = []
A_k = A  # k-step transition matrix; starts as the 1-step matrix
for step in tqdm(range(order)):
    if step > 0:
        # Advance from the k-step to the (k+1)-step transition matrix so that
        # every iteration factorises a different matrix.
        A_k = A_k.dot(A)
    target_matrix = get_target_matrix(A_k)

    svd = TruncatedSVD(n_components=dimensions,
                       n_iter=iterations,
                       random_state=random_state)

    svd.fit(target_matrix)
    embedding = svd.transform(target_matrix)
    embeddings.append(embedding)

# Stack the per-step embeddings side by side: one row per node,
# `order * dimensions` columns.
implemented_embeddings = np.concatenate(embeddings, axis=1)

In [None]:
# Node labels as a flat numpy array (ravel() flattens the label tensor in case
# it is stored as a column), aligned with the rows of the embedding matrices.
y = dataset[0].y.numpy().ravel()
if max_nodes is not None:
    # The graph was cut down to `subset_nodes` and relabelled 0..n-1 in
    # ascending order of original node id, so pick the labels the same way.
    y = y[sorted(subset_nodes)]

In [None]:
# Hold out a test split of the implemented embeddings, fit a logistic-regression
# classifier on the remainder and print its accuracy on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    implemented_embeddings,
    y,
    test_size=test_size,
    random_state=random_state,
)
model = LogisticRegression(max_iter=100_000).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Reference embeddings: karateclub's GraRep, configured from the same
# hyper-parameters (dimensions, iterations, order, seed) as the notebook's own
# SVD loop.
model = GraRep(
    dimensions=dimensions,
    iteration=iterations,
    order=order,
    seed=random_state,
)
model.fit(G)
karateclub_embeddings = model.get_embedding()

In [None]:
# Evaluate the karateclub embeddings with the same protocol as the implemented
# ones: identical split parameters and the same iteration budget for the
# classifier (the default max_iter would make the comparison unequal).
X_train, X_test, y_train, y_test = train_test_split(karateclub_embeddings, y, random_state=random_state, test_size=test_size)
model = LogisticRegression(max_iter=int(1e5))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))