# Libs

In [None]:
import os

import torch
import torch.nn as nn

import pandas as pd
import numpy as np
import json

from tqdm import tqdm
from datetime import datetime
import time
from collections import deque
import matplotlib.pyplot as plt

In [None]:
import torch_geometric
from torch_geometric.data import Data, DataLoader
import torch_geometric.utils as utils

In [None]:
import networkit as nk
import networkx as nx

# Setting

In [None]:
class Setting:
    """Central configuration for the notebook.

    Class attributes hold dataset locations (all derived from the current
    working directory at class-definition time) and training hyper-parameters.
    Creating an instance additionally prepares a per-run output directory
    ``<cwd>/result/<date_time>`` and exposes the file paths written into it.
    """

    _root = os.getcwd()

    _data = os.path.join(_root, "hw1_data")

    data_split = os.path.join(_root, "split.json")

    # testset datapath
    data_synthetic = os.path.join(_data, "Synthetic", "5000")

    data_youtube = os.path.join(_data, "Real", "youtube")
    data_amazon = os.path.join(_data, "Real", "amazon")
    data_dblp = os.path.join(_data, "Real", "dblp")
    data_comlj = os.path.join(_data, "Real", "com-lj")

    # trainset datapath
    data_train = os.path.join(_data, "train")
    # validset datapath
    data_valid = os.path.join(_data, "valid")

    # Setting of training
    # Prefer the second GPU as before, but fall back to the first one when
    # only a single GPU is present ("cuda:1" would be an invalid ordinal).
    if torch.cuda.device_count() > 1:
        device = torch.device("cuda:1")
    elif torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    epochs = 500
    nodes_cnt = 200

    params_drbc = dict(
        # according to source paper
        encoder_params = dict(
            c = 3,
            p = 128,
            num_layers = 5,
            device = device
        ),
        decoder_params = dict(
            p = 128,
            q = 64
        )
    )

    def __init__(self, date_time=None):
        """Create (or reuse) the result directory for one run.

        Args:
            date_time: Name of the run directory under ``result/``. When
                ``None`` the current time formatted as ``%Y-%m-%d %H-%M``
                is used.
        """
        # Create dir for train/test
        if date_time is None:
            date_time = datetime.strftime(datetime.now(), "%Y-%m-%d %H-%M")
        self.root = os.path.join(self._root, "result", date_time)
        # exist_ok avoids the check-then-create race and allows resuming a run.
        os.makedirs(self.root, exist_ok=True)

        ### Save plt info
        self.train_info_p = os.path.join(self.root, "train.json")
        self.val_info_p = os.path.join(self.root, "valid.json")
        self.test_info_p = os.path.join(self.root, "test.json")

        ### Save plt img
        self.result_plt_p = os.path.join(self.root, "train_plt.png")
        self.test_plt_p = os.path.join(self.root, "test_plt.png")

        # Checkpoint file for the DrBC model
        self.weight_drbc = os.path.join(self.root, "drbc.pth")
    

#### Usage

In [None]:
# Minute-resolution timestamp used as the name of this run's result folder.
date_time = datetime.now().strftime("%Y-%m-%d %H-%M")
# To reuse the folder of an earlier run, set the name explicitly instead:
# date_time = "2021-03-23 00-55"

In [None]:
# Setting.__init__ takes the run name as `date_time`; the former `d_name`
# keyword does not exist and raised a TypeError.
setting = Setting(date_time=date_time)

setting.root

# Data functions

In [None]:
def load_data(path, mode="between"):
    """Collect paired graph / centrality file paths of one dataset folder.

    The folder is expected to contain a ``graph`` sub-directory and one
    sub-directory per centrality mode, with identically named files.

    Args:
        path: Dataset folder containing ``graph/`` and ``<mode>/``.
        mode: ``"between"`` or ``"closeness"``; selects the score folder.

    Returns:
        ``(edge_index, centrality)``: two lists of file paths of equal length,
        where position ``i`` of both lists refers to the same file name.

    Raises:
        AssertionError: If ``mode`` is not a known centrality mode.
    """
    assert mode=="between" or mode=="closeness", "Unknown centrality mode."

    graph_dir = os.path.join(path, "graph")
    # os.listdir returns names in arbitrary order; sort for reproducible runs.
    names = sorted(os.listdir(graph_dir))

    # Edge lists live inside "graph/" (previously that component was dropped).
    edge_index = [os.path.join(graph_dir, name) for name in names]
    centrality = [os.path.join(path, mode, name) for name in names]

    return edge_index, centrality

In [None]:
from sklearn.model_selection import train_test_split


def split_data(path, x, y, replace=False):
    """Create a train/valid/test split of ``x``/``y`` and store it as JSON.

    20% of the samples go to the test set; 15% of the remainder go to the
    validation set. The result is written to ``path`` with the keys
    ``X_train``, ``X_valid``, ``X_test``, ``y_train``, ``y_valid``, ``y_test``.

    Args:
        path: Destination JSON file.
        x: Sequence of samples (must be JSON serialisable after splitting).
        y: Sequence of targets aligned with ``x``.
        replace: When ``False`` (default) an existing split file is kept
            untouched; when ``True`` it is regenerated and overwritten.
    """
    # Keep an existing split unless the caller explicitly asks to replace it.
    # (The original condition was inverted: it skipped only when replace=True.)
    if os.path.exists(path) and not replace:
        return

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15)

    split = {
        "X_train": X_train,
        "X_valid": X_valid,
        "X_test": X_test,
        "y_train": y_train,
        "y_valid": y_valid,
        "y_test": y_test
    }

    with open(path, 'w') as f:
        json.dump(split, f)

### Convert to PyG `Data`

In [None]:
def to_data(x, y=None, x_sep=None, y_sep=None, usecols=None):
    """Build a ``torch_geometric.data.Data`` graph from text files.

    Args:
        x: Path of the edge-list file (one edge per line).
        y: Optional path of the per-node score file.
        x_sep: Column separator of the edge-list file; ``None`` uses the
            default of ``read_txt_array``.
        y_sep: Delimiter forwarded to ``np.loadtxt`` for the score file.
        usecols: Column selection forwarded to ``np.loadtxt``.

    Returns:
        ``Data`` with ``x`` = float64 node features ``[degree, 1, 1]``, the
        symmetrised ``edge_index`` and, when ``y`` is given, ``y`` = scores
        reshaped to a single column.
    """
    # sep=None is read_txt_array's default, so a single call covers both cases.
    edge_index = torch_geometric.io.read_txt_array(x, sep=x_sep, dtype=torch.long)
    edge_index = edge_index.t().contiguous()
    edge_index = utils.to_undirected(edge_index)

    # utils.degree counts how often each node id occurs in the given index
    # vector. The edge list was symmetrised just above, so counting over the
    # target row yields each node's undirected degree.
    _, col = edge_index
    deg = utils.degree(col)

    # Node features [degree, 1, 1]. Built directly as a float64 tensor:
    # the former np.float alias was removed in NumPy 1.24.
    ones = torch.ones_like(deg)
    vertice = torch.stack([deg, ones, ones], dim=1).to(torch.float64)

    if y is not None:
        ### centrality scores, one row per node
        score = np.loadtxt(y, delimiter=y_sep, usecols=usecols)
        score = np.reshape(score, (-1, 1))
        score = torch.from_numpy(score)

        data = Data(x=vertice, edge_index=edge_index, y=score)

    else:
        data = Data(x=vertice, edge_index=edge_index)

    return data

### Convert to PyG `DataLoader`

In [None]:
def to_dataloader(x, y, batch, y_sep=None, usecols=None):
    """Turn paired edge-list / score files into a PyG ``DataLoader``.

    Args:
        x: Iterable of edge-list file paths.
        y: Iterable of score file paths, aligned with ``x``.
        batch: Number of graphs per batch.
        y_sep: Delimiter of the score files, forwarded to ``to_data``.
        usecols: Score-file column selection, forwarded to ``to_data``.

    Returns:
        A ``DataLoader`` over one ``Data`` object per ``(x, y)`` pair.
    """
    graphs = [
        to_data(edge_path, score_path, y_sep=y_sep, usecols=usecols)
        for edge_path, score_path in zip(x, y)
    ]
    return DataLoader(graphs, batch_size=batch)

### From networkx

In [None]:
def from_networkx(G, score_list=None):
    r"""Converts a :obj:`networkx.Graph` or :obj:`networkx.DiGraph` to a
    :class:`torch_geometric.data.Data` instance.

    Node labels are first replaced by consecutive integers and undirected
    graphs are converted to directed ones. Node and edge attributes are
    copied as tensors where possible; ``x`` is set to ``[degree, 1, 1]`` per
    node.

    Args:
        G (networkx.Graph or networkx.DiGraph): A networkx graph.
        score_list (iterable, optional): Per-node scores stored as a
            single-column ``y`` tensor. (default: :obj:`None`)
    """

    G = nx.convert_node_labels_to_integers(G)
    G = G.to_directed() if not nx.is_directed(G) else G
    edge_index = torch.LongTensor(list(G.edges)).t().contiguous()

    data = {}

    # Gather attribute values per key; append in place instead of rebuilding
    # the list on every element (the old `list + [value]` was quadratic).
    for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
        for key, value in feat_dict.items():
            if i == 0:
                data[str(key)] = [value]
            else:
                data[str(key)].append(value)

    for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
        for key, value in feat_dict.items():
            if i == 0:
                data[str(key)] = [value]
            else:
                data[str(key)].append(value)

    for key, item in data.items():
        try:
            data[key] = torch.tensor(item)
        except ValueError:
            # Best effort: attributes that cannot become a tensor stay lists.
            pass

    data['edge_index'] = edge_index.view(2, -1)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    # NOTE(review): G is directed at this point, so G.degree presumably counts
    # in- plus out-edges (twice the undirected degree) -- confirm this matches
    # the degree feature used for training.
    data['x'] = torch.from_numpy(
        np.array([[G.degree[i], 1, 1] for i in G.nodes()], dtype=np.float64))

    if score_list is not None:
        data['y'] = torch.from_numpy(
            np.array([[b] for b in score_list], dtype=np.float64))
    data = torch_geometric.data.Data.from_dict(data)
    data.num_nodes = G.number_of_nodes()

    return data

### Random generate data

In [None]:
def generate_nx_graph(nodes_cnt):
    """Sample a random power-law cluster graph and its exact centralities.

    Args:
        nodes_cnt: Number of nodes of the generated graph.

    Returns:
        ``(G, bc, cc)``: the networkx graph, and NumPy arrays holding the
        betweenness and closeness centrality of every node, ordered by
        ascending integer node id.
    """

    def _values_by_node_id(scores):
        # Flatten a {node: value} mapping into an array sorted by node id.
        return np.array([scores[node] for node in sorted(scores, key=int)])

    # Draw network G from distribution D (like the power-law model)
    G = nx.generators.random_graphs.powerlaw_cluster_graph(n=nodes_cnt, m=4, p=0.05)

    # Exact betweenness value of each node
    bc = _values_by_node_id(nx.algorithms.centrality.betweenness_centrality(G))

    # Exact closeness value of each node
    cc = _values_by_node_id(nx.algorithms.centrality.closeness_centrality(G))

    return G, bc, cc

#### Usage

Example: generate 10,000 graphs and save them under `train/`.

In [None]:
# Output folder for the generated training graphs: <data>/train/<nodes_cnt>.
# `nodes_cnt` is an attribute of Setting; the bare name is not defined here.
to_dir = os.path.join(Setting._data, "train", str(Setting.nodes_cnt))

os.makedirs(to_dir, exist_ok=True)

In [None]:
# Sub-folders for edge lists, betweenness scores and closeness scores.
g_dir = os.path.join(to_dir, "graph")
bc_dir = os.path.join(to_dir, "between")
cc_dir = os.path.join(to_dir, "closeness")

# exist_ok=True replaces the separate exists() checks (no check/create race).
os.makedirs(g_dir, exist_ok=True)
os.makedirs(bc_dir, exist_ok=True)
os.makedirs(cc_dir, exist_ok=True)

In [None]:
# Generate 10,000 random graphs and store, under the same file name, the edge
# list and the exact betweenness / closeness scores (one value per line).
for i in range(10000):
    G, bc, cc = generate_nx_graph(Setting.nodes_cnt)
    save_name = "{}.txt".format(i)

    # Close the handle deterministically so the edge list is flushed to disk
    # (it was previously left open for every generated graph).
    with open(os.path.join(g_dir, save_name), "wb") as edge_f:
        nx.readwrite.edgelist.write_edgelist(G, edge_f, data=False)

    np.savetxt(os.path.join(bc_dir, save_name), bc, fmt="%.20f")

    np.savetxt(os.path.join(cc_dir, save_name), cc, fmt="%.20f")

## Usage

In [None]:
# 1. Load data
# Step 1: collect the edge-list and closeness-score file paths of the
# training graphs that have Setting.nodes_cnt nodes.
train, train_score = load_data(
    os.path.join(Setting.data_train, str(Setting.nodes_cnt)),
    mode="closeness",
)

# Step 2: wrap the file pairs in a DataLoader yielding two graphs per batch.
train_loader = to_dataloader(train, train_score, batch=2)

# Model

## Evaluation

### Top N %

In [None]:
def top_n_percentage(bc_gt, bc_pr, k):
    """Return the overlap between the top-``k``% nodes of two score vectors.

    Args:
        bc_gt: Ground-truth scores, a ``torch.Tensor`` or NumPy array of any
            shape (flattened internally).
        bc_pr: Predicted scores for the same nodes, same accepted types.
        k: Percentage (0-100] of nodes that form the "top" set.

    Returns:
        Fraction in ``[0, 1]`` of the top-``k``% ground-truth nodes that also
        appear in the top-``k``% predicted nodes.

    Raises:
        ValueError: If ``bc_gt`` contains no elements.
    """

    def _flatten(values):
        # Accept tensors and arrays alike. No device transfer is needed: only
        # the resulting indices are used, and they are read back on the host.
        if not isinstance(values, torch.Tensor):
            values = torch.from_numpy(np.asarray(values))
        return torch.reshape(values, (-1, ))

    bc_gt = _flatten(bc_gt)
    bc_pr = _flatten(bc_pr)

    nodes = bc_gt.size()[0]
    if nodes == 0:
        raise ValueError("top_n_percentage needs at least one node score.")

    # Compare at least one node (small graphs used to give k == 0 and a
    # division by zero) and never more nodes than exist.
    k = min(nodes, max(1, int(nodes * k / 100)))

    _, gt_indice = torch.topk(bc_gt, k)
    _, pr_indice = torch.topk(bc_pr, k)

    gt_top = set(gt_indice.tolist())
    pr_top = set(pr_indice.tolist())

    return len(gt_top & pr_top) / k

### Kendall Tau Distance

In [None]:
from scipy import stats

def kendal_tau_distance(bc_gt, bc_pr):
    """Return the Kendall tau statistic between two score vectors.

    Tensor inputs are flattened and converted to NumPy arrays first; any
    other input is handed to ``scipy.stats.kendalltau`` unchanged.

    Args:
        bc_gt: Ground-truth scores (tensor or array-like).
        bc_pr: Predicted scores for the same nodes (tensor or array-like).

    Returns:
        The tau value reported by ``scipy.stats.kendalltau``; the accompanying
        p-value is discarded.
    """

    def _as_numpy(values):
        if not isinstance(values, torch.Tensor):
            return values
        flat = torch.reshape(values, (-1, ))
        return flat.cpu().detach().numpy()

    tau, _ = stats.kendalltau(_as_numpy(bc_gt), _as_numpy(bc_pr))
    return tau

### Load Model

In [None]:
def load_checkpoint(filepath, device, **params):
    """Build a DrBC model with an Adam optimizer, restoring a checkpoint if any.

    Args:
        filepath: Checkpoint file. When it exists, its ``'model_stat'`` and
            ``'optimizer_stat'`` entries are loaded into the model and the
            optimizer respectively.
        device: Device the model is moved to and the checkpoint is mapped to.
        **params: Must contain ``drbc``, the keyword arguments for ``DrBC``.

    Returns:
        ``(model, optimizer)``; the optimizer is Adam with ``lr=1e-4``.
    """

    model = DrBC(**params["drbc"])

    model = model.to(device)

    # The optimizer is built the same way in both cases; only its state differs.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    if os.path.exists(filepath):
        print("pretrained finded")
        # map_location lets a checkpoint saved on one device (e.g. a GPU) be
        # restored on another one (e.g. a CPU-only machine).
        checkpoint = torch.load(filepath, map_location=device)
        model.load_state_dict(checkpoint['model_stat'])
        optimizer.load_state_dict(checkpoint['optimizer_stat'])

    else:
        print("use a new optimizer")

    return model, optimizer

#### Usage

In [None]:
# Build the DrBC model and its optimizer; load_checkpoint restores both from
# setting.weight_drbc when that file already exists.
model, optimizer = load_checkpoint(
    setting.weight_drbc,
    Setting.device,
    drbc=Setting.params_drbc,
)

model

### Evaluation

In [None]:
def eval_model(model, dataloader, device):
    """Evaluate a model graph by graph on every batch of ``dataloader``.

    For each graph of each batch the top-1/5/10% overlap and the Kendall tau
    between ground truth (``batch.y``) and prediction are computed, together
    with a pairwise ranking loss on ``5 * n`` randomly sampled node pairs.

    Args:
        model: Module called as ``model(batch)``, returning one score per node.
        dataloader: Iterable of PyG batches providing ``y`` and ``batch``.
        device: Device used for the model and the batches.

    Returns:
        ``(top1_list, top5_list, top10_list, kendal_list, time_list,
        loss_list)`` with one entry per graph. ``time_list`` holds the forward
        time of the whole batch the graph belongs to.
    """

    model = model.eval().to(device)

    # Stateless modules; create them once instead of once per graph.
    sigmoid = nn.Sigmoid()
    bce = nn.BCELoss()

    top1_list = []
    top5_list = []
    top10_list = []
    kendal_list = []
    loss_list = []
    time_list = []

    # Pure inference: do not record autograd graphs for the forward passes.
    with torch.no_grad():
        for batch in tqdm(dataloader):

            batch = batch.to(device)

            # NOTE(review): on CUDA this measures launch time unless the device
            # is synchronised before reading the clock -- confirm if timings
            # on GPU matter.
            start = time.time()
            bc_pr = model(batch)
            end = time.time()

            # batch.batch maps every node to the index of its graph.
            b_index = batch.batch.cpu().numpy()
            b = np.max(b_index) + 1

            for b_ in range(b):

                indice, = np.where(b_index == b_)

                # reshape(-1) keeps a 1-D vector even for a single-node graph,
                # where squeeze() would collapse it to a 0-d tensor.
                gt = batch.y[indice].reshape(-1)
                pr = bc_pr[indice].reshape(-1)

                # evaluation
                top1 = top_n_percentage(gt, pr, k=1)
                top5 = top_n_percentage(gt, pr, k=5)
                top10 = top_n_percentage(gt, pr, k=10)
                kendal = kendal_tau_distance(gt, pr)

                # compute loss on randomly sampled (source, target) node pairs
                src = np.random.choice(len(indice), 5*len(indice), replace=True)
                det = np.random.choice(len(indice), 5*len(indice), replace=True)
                src = torch.from_numpy(src)
                det = torch.from_numpy(det)

                y_gt = gt[det] - gt[src]
                y_pr = pr[det] - pr[src]

                y_gt = sigmoid(y_gt)
                y_pr = sigmoid(y_pr)

                loss = bce(y_pr, y_gt)

                top1_list.append(top1)
                top5_list.append(top5)
                top10_list.append(top10)
                kendal_list.append(kendal)
                loss_list.append(loss.item())
                time_list.append(end-start)

    return top1_list, top5_list, top10_list, kendal_list, time_list, loss_list

# Baselines

## RK(DIAM)

ApproxBetweenness

Fast approximation of betweenness centrality through sampling.

In [None]:
def run_rk(save_path, edge_lists, scores):
    """Benchmark networkit's ``ApproxBetweenness`` (RK) and save the metrics.

    Args:
        save_path: JSON file receiving the lists ``top1``, ``top5``,
            ``top10``, ``kendal`` and ``time`` (one entry per graph).
        edge_lists: Iterable of tab-separated edge-list files.
        scores: Iterable of ground-truth score files, aligned with
            ``edge_lists``; the second column holds the scores.
    """
    jsn = {
        "top1": [],
        "top5": [],
        "top10": [],
        "kendal": [],
        "time": []
    }
    for edge_list, score in zip(edge_lists, scores):
        G_nx = nx.readwrite.edgelist.read_edgelist(edge_list, delimiter="\t")
        # NOTE(review): presumably nx2nk numbers nodes in G_nx iteration order,
        # not by their label -- verify that this matches the row order of the
        # score file.
        G_nk = nk.nxadapter.nx2nk(G_nx)

        method = nk.centrality.ApproxBetweenness(G_nk, epsilon=0.1)
        start = time.time()
        method.run()
        end = time.time()

        gt = np.loadtxt(score, usecols=1)
        # Fetch the scores once instead of once per metric.
        pr = np.array(method.scores())

        jsn["time"].append(end-start)
        jsn["kendal"].append(kendal_tau_distance(pr, gt))
        jsn["top1"].append(top_n_percentage(pr, gt, k=1))
        jsn["top5"].append(top_n_percentage(pr, gt, k=5))
        jsn["top10"].append(top_n_percentage(pr, gt, k=10))

    # Open the output only after all work succeeded, so a failure does not
    # leave an empty result file behind.
    with open(save_path, "w") as f:
        json.dump(jsn, f)

In [None]:
# run_rk(os.path.join(setting.root, "rk.json"), edge_lists=xxx, scores=xxx)

## KADABRA

In [None]:
def run_kadabra(save_path, edge_lists, scores):
    """Benchmark networkit's ``KadabraBetweenness`` and save the metrics.

    Args:
        save_path: JSON file receiving the lists ``top1``, ``top5``,
            ``top10``, ``kendal`` and ``time`` (one entry per graph).
        edge_lists: Iterable of tab-separated edge-list files.
        scores: Iterable of ground-truth score files, aligned with
            ``edge_lists``; the second column holds the scores.
    """
    jsn = {
        "top1": [],
        "top5": [],
        "top10": [],
        "kendal": [],
        "time": []
    }
    for edge_list, score in zip(edge_lists, scores):
        G_nx = nx.readwrite.edgelist.read_edgelist(edge_list, delimiter="\t")
        # NOTE(review): presumably nx2nk numbers nodes in G_nx iteration order,
        # not by their label -- verify that this matches the row order of the
        # score file.
        G_nk = nk.nxadapter.nx2nk(G_nx)

        method = nk.centrality.KadabraBetweenness(G_nk, 0.05, 0.8)
        start = time.time()
        method.run()
        end = time.time()

        gt = np.loadtxt(score, usecols=1)
        # Fetch the scores once instead of once per metric.
        pr = np.array(method.scores())

        jsn["time"].append(end-start)
        jsn["kendal"].append(kendal_tau_distance(pr, gt))
        jsn["top1"].append(top_n_percentage(pr, gt, k=1))
        jsn["top5"].append(top_n_percentage(pr, gt, k=5))
        jsn["top10"].append(top_n_percentage(pr, gt, k=10))

    # Open the output only after all work succeeded, so a failure does not
    # leave an empty result file behind.
    with open(save_path, "w") as f:
        json.dump(jsn, f)

In [None]:
# run_kadabra(os.path.join(setting.root, "kadabra.json"), edge_lists=xxx, scores=xxx)

## KBC

Clone https://github.com/ecrc/BeBeCA and run KBC.

In [None]:
def run_kbc(save_path, edge_lists, scores):
    """Benchmark the external BeBeCA ``KPATH`` binary and save the metrics.

    Every graph is written to ``BeBeCA/Source_Code/5000/<name>.txt`` (first
    line: node count and edge count, then one edge per line), ``KPATH`` is
    run on it, and its ``<name>_pr.txt`` output is compared with the ground
    truth.

    Args:
        save_path: JSON file receiving the lists ``top1``, ``top5``,
            ``top10``, ``kendal`` and ``time`` (one entry per graph).
        edge_lists: Iterable of tab-separated edge-list files.
        scores: Iterable of ground-truth score files, aligned with
            ``edge_lists``; the second column holds the scores.

    Raises:
        subprocess.CalledProcessError: If ``KPATH`` exits with a non-zero
            status.
    """

    import subprocess

    work_dir = "BeBeCA/Source_Code/5000"
    # np.savetxt does not create missing directories.
    os.makedirs(work_dir, exist_ok=True)

    jsn = {
        "top1": [],
        "top5": [],
        "top10": [],
        "kendal": [],
        "time": []
    }
    for edge_list, score in zip(edge_lists, scores):

        base = os.path.splitext(os.path.basename(edge_list))[0]

        G_nx = nx.readwrite.edgelist.read_edgelist(edge_list, delimiter="\t")

        # Convert to pyg.Data to get nodes_cnt and edges_cnt
        G_pyg = from_networkx(G_nx)

        # Header row "<nodes> <edges>" followed by the edge list.
        arr = np.array([G_pyg.x.shape[0], G_pyg.edge_index.shape[1]]).reshape((1, 2))
        arr = np.concatenate([arr, G_pyg.edge_index.t().numpy()])

        save = os.path.join(work_dir, "{}.txt".format(base))
        save_pr = os.path.join(work_dir, "{}_pr.txt".format(base))

        np.savetxt(save, arr, fmt="%d")

        start = time.time()
        # check=True surfaces a failing KPATH run here instead of as a
        # confusing read error on the missing output file below.
        subprocess.run(["./BeBeCA/Source_Code/KPATH", "2", save, save_pr], check=True)
        end = time.time()

        pr = np.loadtxt(save_pr, delimiter=":", usecols=1)
        gt = np.loadtxt(score, usecols=1)

        jsn["time"].append(end-start)
        jsn["kendal"].append(kendal_tau_distance(gt, pr))
        jsn["top1"].append(top_n_percentage(gt, pr, k=1))
        jsn["top5"].append(top_n_percentage(gt, pr, k=5))
        jsn["top10"].append(top_n_percentage(gt, pr, k=10))

    # Open the output only after all work succeeded, so a failure does not
    # leave an empty result file behind.
    with open(save_path, "w") as f:
        json.dump(jsn, f)

In [None]:
# run_kbc(os.path.join(setting.root, "kbc.json"), edge_lists=xxx, scores=xxx)

## DrBC

In [None]:
def run_drbc(model, save_path, edge_lists, scores, batch=1, y_sep=None, usecols=None):
    """Evaluate a DrBC model on CPU and save the metrics as JSON.

    Args:
        model: Model passed to ``eval_model``.
        save_path: JSON file receiving the lists ``top1``, ``top5``,
            ``top10``, ``kendal`` and ``time`` (one entry per graph).
        edge_lists: Iterable of edge-list files.
        scores: Iterable of ground-truth score files, aligned with
            ``edge_lists``.
        batch: Number of graphs per batch (``to_dataloader`` requires it; it
            was previously omitted, which raised a TypeError).
        y_sep: Delimiter of the score files, forwarded to ``to_dataloader``.
        usecols: Score-file column selection, forwarded to ``to_dataloader``.
            NOTE(review): the other baselines read scores with ``usecols=1``;
            pass the same value here if the files share that layout.
    """
    data_loader = to_dataloader(edge_lists, scores, batch, y_sep=y_sep, usecols=usecols)

    top1_list, top5_list, top10_list, kendal_list, time_list, loss_list = eval_model(model, data_loader, "cpu")

    jsn = {
        "top1": top1_list,
        "top5": top5_list,
        "top10": top10_list,
        "kendal": kendal_list,
        "time": time_list
    }

    # Honour the caller's save_path (it used to be ignored in favour of a
    # hard-coded file) and write only after the evaluation succeeded.
    with open(save_path, "w") as f:
        json.dump(jsn, f)

In [None]:
# run_drbc(model, os.path.join(setting.root, "drbc.json"), edge_lists=xxx, scores=xxx)