In [1]:
import argparse
import torch
from dgl.data import register_data_args
import logging
#import fire
from optim import trainer, TUtrainer, AEtrainer
from optim.loss import loss_function,init_center
from datasets import dataloader,TUloader
from networks.init import init_model
import numpy as np
import torch
from dgl import random as dr

def main(args):
    """Train a one-class GNN anomaly detector configured by ``args``.

    Seeds NumPy, PyTorch and DGL (unless ``args.seed`` is -1), configures file
    logging under ``./log`` and dispatches to the trainer matching the dataset
    and module:

    * TU graph datasets (``PROTEINS_full``, ``ENZYMES``, ``FRANKENSTEIN``) use
      ``TUloader`` and ``TUtrainer``;
    * every other dataset uses ``dataloader`` with ``AEtrainer`` when
      ``args.module == 'GAE'`` and ``trainer`` otherwise.

    Args:
        args: parsed command-line namespace (see the ``__main__`` section).
    """
    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        dr.seed(args.seed)

    checkpoints_path = f'./checkpoints/{args.dataset}+OC-{args.module}+bestcheckpoint.pt'
    logging.basicConfig(filename=f"./log/{args.dataset}+OC-{args.module}.log",filemode="a",format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",level=logging.INFO)
    logger = logging.getLogger('OCGNN')

    # Exact-name membership. The previous check tested whether the dataset name
    # was a *substring* of the concatenated names, so values such as 'full',
    # 'E' or the empty string were also routed to the TU pipeline.
    if args.dataset in ('PROTEINS_full', 'ENZYMES', 'FRANKENSTEIN'):
        train_loader, val_loader, test_loader, input_dim, label_dim = TUloader.loader(args)
        model = init_model(args, input_dim)
        # NOTE: no test-set evaluation is performed for TU datasets here.
        model = TUtrainer.train(args, logger, train_loader, model,
                                val_dataset=val_loader, path=checkpoints_path)
    else:
        data = dataloader.loader(args)
        model = init_model(args, data['input_dim'])
        if args.module != 'GAE':
            model = trainer.train(args, logger, data, model, checkpoints_path)
        else:
            model = AEtrainer.train(args, logger, data, model, checkpoints_path)


if __name__ == '__main__':
    # Command-line entry point: build the argument parser, apply the
    # per-module and per-dataset overrides, then run training.
    parser = argparse.ArgumentParser(description='OCGNN')
    register_data_args(parser)
    parser.add_argument("--dropout", type=float, default=0.5,
            help="dropout probability")
    parser.add_argument("--nu", type=float, default=0.2,
            help="hyperparameter nu (must be 0 < nu <= 1)")
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--seed", type=int, default=52,
            help="random seed, -1 means dont fix seed")
    parser.add_argument("--module", type=str, default='GraphSAGE',
            help="GCN/GAT/GIN/GraphSAGE/GAE")
    parser.add_argument('--n-worker', type=int,default=1,
            help='number of workers when dataloading')
    parser.add_argument('--batch-size', type=int,default=128,
            help='batch size')
    parser.add_argument("--lr", type=float, default=1e-3,
            help="learning rate")
    parser.add_argument("--normal-class", type=int, default=0,
            help="normal class")
    parser.add_argument("--n-epochs", type=int, default=5000,
            help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=32,
            help="number of hidden gnn units")
    parser.add_argument("--n-layers", type=int, default=2,
            help="number of hidden gnn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
            help="Weight for L2 loss")
    parser.add_argument('--early-stop', action='store_true', default=False,
                        help="indicates whether to use early stop or not")
    parser.add_argument("--self-loop", action='store_true',
            help="graph self-loop (default=False)")
    parser.add_argument("--norm", action='store_true',
            help="graph normalization (default=False)")
    # NOTE: this default makes self-loops always enabled; the --self-loop flag
    # can only confirm it, never switch it off.
    parser.set_defaults(self_loop=True)
    parser.set_defaults(norm=False)
    args = parser.parse_args()

    # Module-specific overrides.
    if args.module=='GCN':
        args.norm=True
    if args.module=='GAE':
        args.lr=0.002
        args.dropout=0.
        args.weight_decay=0.

    # Dataset-specific normal class (overrides --normal-class). Exact-name
    # membership is used: the previous substring test against a concatenated
    # string matched fragments such as 'it' or '' and raised TypeError when
    # --dataset was omitted (None).
    if args.dataset in ('citeseer', 'reddit'):
        args.normal_class=3
    if args.dataset in ('cora', 'pubmed'):
        args.normal_class=2
    if args.dataset in ('TU_PROTEINS_full', 'PROTEINS_full'):
        args.normal_class=0

    main(args)


usage: ipykernel_launcher.py [-h] [--dataset DATASET] [--dropout DROPOUT]
                             [--nu NU] [--gpu GPU] [--seed SEED]
                             [--module MODULE] [--n-worker N_WORKER]
                             [--batch-size BATCH_SIZE] [--lr LR]
                             [--normal-class NORMAL_CLASS]
                             [--n-epochs N_EPOCHS] [--n-hidden N_HIDDEN]
                             [--n-layers N_LAYERS]
                             [--weight-decay WEIGHT_DECAY] [--early-stop]
                             [--self-loop] [--norm]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/kennethjohn/Library/Jupyter/runtime/kernel-c2f3fa66-933a-4f28-8e22-6dc2fc6b7161.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import argparse
from dgl.data import register_data_args
import time
from datasets.dataloader import emb_dataloader
from utils.evaluate import baseline_evaluate
import fire
import logging
from embedding.get_embedding import embedding
from pyod.models.ocsvm import OCSVM
from pyod.models.iforest import IForest
from pyod.models.pca import PCA
from pyod.models.auto_encoder import AutoEncoder


import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score,precision_score,recall_score,average_precision_score,roc_auc_score,roc_curve

def main():
    """Run a two-stage baseline: node representation + classical detector.

    Parses the command line, builds the node representation selected by
    ``--mode`` (``X``: raw features, ``A``: graph embeddings, ``AX``: both
    concatenated), fits the detector selected by ``--ad-method`` on the
    training nodes and logs validation and test metrics.

    Raises:
        ValueError: if ``--ad-method`` is not one of OCSVM, IF, PCA or AE.
    """
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode", type=str, default='A',choices=['A','AX','X'],
            help="input data: A (graph embedding), AX (embedding + features), X (features only)")
    parser.add_argument("--seed", type=int, default=-1,
            help="random seed, -1 means dont fix seed")
    parser.add_argument("--emb-method", type=str, default='DeepWalk',
            help="embedding methods: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method", type=str, default='OCSVM',
            help="anomaly detection methods: PCA,OCSVM,IF,AE")
    parser.add_argument("--normal-class", type=int, default=0,
            help="normal class")
    args = parser.parse_args()

    # Dataset-specific normal class (overrides --normal-class). Exact-name
    # membership replaces the previous substring test on concatenated names.
    if args.dataset in ('citeseer', 'reddit'):
        args.normal_class=3
    if args.dataset in ('cora', 'pubmed'):
        args.normal_class=2
    if args.dataset in ('TU_PROTEINS_full', 'PROTEINS_full'):
        args.normal_class=0

    if args.seed!=-1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(filename=f"./log/{args.dataset}-twostage-{args.seed}.log",filemode="a",format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",level=logging.INFO)
    logger=logging.getLogger('baseline')

    datadict=emb_dataloader(args)

    # Embedding time; stays 0 in mode 'X', where no embedding is computed
    # (previously dur1 was unbound in that mode and the timing print crashed).
    dur1=0.0
    if args.mode=='X':
        data=datadict['features']
    else:
        t0 = time.time()
        embeddings=embedding(args,datadict)
        dur1=time.time() - t0

        if args.mode=='A':
            data=embeddings
        else:  # 'AX' (argparse restricts --mode to A/AX/X)
            data=np.concatenate((embeddings,datadict['features']),axis=1)

    logger.debug(f'data shape: {data.shape}')

    if args.ad_method=='OCSVM':
        clf = OCSVM(contamination=0.1)
    elif args.ad_method=='IF':
        clf = IForest(n_estimators=100,contamination=0.1,n_jobs=-1,behaviour="new")
    elif args.ad_method=='PCA':
        clf = PCA(contamination=0.1)
    elif args.ad_method=='AE':
        clf = AutoEncoder(contamination=0.1)
    else:
        # Fail early with a clear message instead of a NameError on `clf`.
        raise ValueError(f'unknown --ad-method {args.ad_method!r}; expected one of OCSVM, IF, PCA, AE')

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2=time.time() - t1

    print('traininig time:', dur1+dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(f'Parameters dataset:{args.dataset} datamode:{args.mode} ad-method:{args.ad_method} emb-method:{args.emb_method}')
    logger.info('-------------Evaluating Validation Results--------------')

    t2 = time.time()
    y_pred_val=clf.predict(data[datadict['val_mask']])
    y_score_val=clf.decision_function(data[datadict['val_mask']])
    auc,ap,f1,acc,precision,recall=baseline_evaluate(datadict,y_pred_val,y_score_val,val=True)
    dur3=time.time() - t2
    print('infer time:', dur3)

    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}')

    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test=clf.predict(data[datadict['test_mask']])
    y_score_test=clf.decision_function(data[datadict['test_mask']])
    auc,ap,f1,acc,precision,recall=baseline_evaluate(datadict,y_pred_test,y_score_test,val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}')


if __name__ == '__main__':
    # main() takes no parameters and parses sys.argv itself with argparse, so
    # it is called directly. Routing it through fire.Fire made Fire inspect the
    # same command-line flags as arguments it could not consume.
    main()


In [None]:
import time
import numpy as np
import torch
import logging
#from dgl.contrib.sampling.sampler import NeighborSampler
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score,precision_score,recall_score,average_precision_score,roc_auc_score,roc_curve


from optim.loss import EarlyStopping

# Choose the mode of GAE: 'A' means Kipf's GAE, 'X' means an AE that considers the network structure, 'AX' means Ding's Dominant model.
# GAE_mode can be selected from 'AX', 'A' or 'X'.
GAE_mode='AX'


def train(args,logger,data,model,path):
    """Train a graph autoencoder with a reconstruction loss.

    Each epoch runs one full-graph forward/backward pass on the training
    nodes, evaluates on the validation nodes and, when ``args.early_stop`` is
    set, checkpoints and stops through ``EarlyStopping``. After training the
    (optionally reloaded) model is evaluated once on the test nodes.

    Args:
        args: namespace providing ``lr``, ``weight_decay``, ``n_epochs``,
            ``early_stop`` and the hyper-parameters that are logged.
        logger: logger receiving the hyper-parameter summary.
        data: dict with keys ``'g'``, ``'features'``, ``'train_mask'``,
            ``'val_mask'``, ``'test_mask'`` and ``'n_edges'`` (``'labels'`` is
            read by ``fixed_graph_evaluate``).
        model: module called as ``model(g, features)`` and returning
            ``(z, re_x, re_adj)``.
        path: checkpoint file used by early stopping.

    Returns:
        The trained model.
    """
    checkpoints_path=path

    logger.info('Start training')
    logger.info(f'dropout:{args.dropout}, nu:{args.nu},seed:{args.seed},lr:{args.lr},self-loop:{args.self_loop},norm:{args.norm}')

    logger.info(f'n-epochs:{args.n_epochs}, n-hidden:{args.n_hidden},n-layers:{args.n_layers},weight-decay:{args.weight_decay}')

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    if args.early_stop:
        stopper = EarlyStopping(patience=100)

    # Dense adjacency is the structure-reconstruction target. Keep it on the
    # same device as the node features instead of assuming CUDA.
    adj=data['g'].adjacency_matrix().to_dense().to(data['features'].device)
    loss_fn = nn.MSELoss()

    # Per-epoch durations; the first three epochs are treated as warm-up.
    dur = []

    # Arrays holding the result curves (only consumed by the optional np.savez below).
    arr_epoch=np.arange(args.n_epochs)
    arr_loss=np.zeros(args.n_epochs)
    arr_valauc=np.zeros(args.n_epochs)
    arr_testauc=np.zeros(args.n_epochs)

    for epoch in range(args.n_epochs):
        # fixed_graph_evaluate() switches the model to eval mode, so training
        # mode has to be restored at the start of every epoch.
        model.train()
        t0 = time.time()

        # forward
        z,re_x,re_adj= model(data['g'],data['features'])

        loss=Recon_loss(re_x,re_adj,adj,data['features'],data['train_mask'],loss_fn,GAE_mode)

        # record the training loss
        arr_loss[epoch]=loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)
        # NaN until the warm-up epochs have passed (same printed value as
        # np.mean([]), without the empty-slice warning).
        mean_dur = np.mean(dur) if dur else float('nan')

        auc,ap,val_loss=fixed_graph_evaluate(args,model,data,adj,data['val_mask'])
        # record the validation AUC
        arr_valauc[epoch]=auc
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Val AUROC {:.4f} | Val loss {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, mean_dur, loss.item()*100000,
                                            auc,val_loss, data['n_edges'] / mean_dur / 1000))
        if args.early_stop:
            if stopper.step(auc,val_loss.item(), model,epoch,checkpoints_path):
                break

    if args.early_stop:
        print('loading model before testing.')
        model.load_state_dict(torch.load(checkpoints_path))

    auc,ap,_ = fixed_graph_evaluate(args,model,data,adj,data['test_mask'])
    test_dur=0
    # record the test AUC at the last executed epoch (skipped when no epoch ran)
    if arr_testauc.size:
        arr_testauc[epoch]=auc
    print("Test Time {:.4f} | Test AUROC {:.4f} | Test AUPRC {:.4f}".format(test_dur,auc,ap))

    #np.savez('Dom3.npz',epoch=arr_epoch,loss=arr_loss,valauc=arr_valauc,testauc=arr_testauc)

    return model

def Recon_loss(re_x,re_adj,adj,x,mask,loss_fn,mode):
    """Reconstruction loss over the rows selected by ``mask``.

    Args:
        re_x: reconstructed node attributes.
        re_adj: reconstructed adjacency matrix.
        adj: target adjacency matrix.
        x: target node attributes.
        mask: row selector applied to every tensor.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar.
        mode: ``'A'`` (structure loss only), ``'X'`` (attribute loss only) or
            ``'AX'`` (equal-weight sum of both).

    Returns:
        The scalar loss produced by ``loss_fn``.

    Raises:
        ValueError: if ``mode`` is not one of ``'A'``, ``'X'``, ``'AX'``.
    """
    if mode=='A':
        # Structure loss. This used to return the attribute loss (a copy of
        # the 'X' branch), which disagreed with anomaly_score(), where mode
        # 'A' scores the adjacency reconstruction.
        return loss_fn(re_adj[mask], adj[mask])
    if mode=='X':
        # Attribute loss.
        return loss_fn(re_x[mask], x[mask])
    if mode=='AX':
        return 0.5*loss_fn(re_x[mask], x[mask]) + 0.5*loss_fn(re_adj[mask], adj[mask])
    # Previously fell through and returned None, failing later in backward().
    raise ValueError(f"unknown GAE mode {mode!r}; expected 'A', 'X' or 'AX'")

def anomaly_score(re_x,re_adj,adj,x,mask,loss_fn,mode):
    """Per-row anomaly score: mean squared reconstruction error.

    ``mode`` selects which reconstruction is scored: ``'A'`` the adjacency,
    ``'X'`` the attributes, ``'AX'`` the equal-weight average of both. Only
    rows selected by ``mask`` are scored. ``loss_fn`` is accepted for
    signature symmetry with ``Recon_loss`` and is not used. Any other mode
    yields ``None``.
    """
    def _row_mse(recon, target):
        # Element-wise squared error, averaged along dim 1 (one value per row).
        squared_err = F.mse_loss(recon[mask], target[mask], reduction='none')
        return torch.mean(squared_err, 1)

    if mode=='AX':
        attr_part = _row_mse(re_x, x)
        struct_part = _row_mse(re_adj, adj)
        return 0.5*attr_part+0.5*struct_part
    if mode=='A':
        return _row_mse(re_adj, adj)
    if mode=='X':
        return _row_mse(re_x, x)
    return None

def fixed_graph_evaluate(args,model,data,adj,mask):
    """Evaluate the autoencoder on the nodes selected by ``mask``.

    Puts the model in eval mode (it is NOT switched back afterwards), runs a
    full-graph forward pass without gradients and returns

    * ``auc``  - ROC AUC of the per-node anomaly scores against the labels,
    * ``ap``   - average precision of the same scores,
    * ``loss`` - reconstruction loss over the masked nodes whose label is
      truthy (NOTE(review): i.e. only label-1 nodes contribute; confirm this
      is intended).
    """
    criterion = nn.MSELoss()
    graph = data['g']
    features = data['features']

    model.eval()
    with torch.no_grad():
        _, recon_x, recon_adj = model(graph, features)

        # Loss is restricted to masked nodes with a truthy label.
        labelled_mask = mask.bool() & data['labels'].bool()
        loss = Recon_loss(recon_x, recon_adj, adj, features, labelled_mask, criterion, GAE_mode)

        node_scores = anomaly_score(recon_x, recon_adj, adj, features, mask, criterion, GAE_mode)

        # sklearn metrics work on host-side numpy arrays.
        y_true = data['labels'][mask].cpu().numpy()
        y_score = node_scores.cpu().numpy()
        auc = roc_auc_score(y_true, y_score)
        ap = average_precision_score(y_true, y_score)

    return auc,ap,loss

In [None]:
import torch    
import numpy as np
import torch.nn.functional as F
    
def loss_function(nu,data_center,outputs,radius=0,mask=None):
    """Soft-boundary hypersphere loss.

    Computes ``radius**2 + (1/nu) * mean(max(0, scores))`` where ``scores`` is
    the squared distance to ``data_center`` minus ``radius**2`` (see
    ``anomaly_score``).

    Returns:
        ``(loss, dist, scores)``.
    """
    dist,scores=anomaly_score(data_center,outputs,radius,mask)
    # Hinge: only points outside the sphere (positive score) are penalised.
    hinge = torch.max(torch.zeros_like(scores), scores)
    penalty = (1 / nu) * torch.mean(hinge)
    loss = radius ** 2 + penalty
    return loss,dist,scores

def anomaly_score(data_center,outputs,radius=0,mask= None):
    """Squared distance of each output row to the center, and its margin.

    Args:
        data_center: center vector, broadcast against the rows of ``outputs``.
        outputs: 2-D tensor of embeddings (summed over dim 1).
        radius: hypersphere radius; defaults to 0.
        mask: optional row selector; when ``None`` every row is scored.

    Returns:
        ``(dist, scores)`` where ``dist`` is the squared Euclidean distance
        per row and ``scores = dist - radius**2``.
    """
    # Identity test: `mask == None` relies on the mask's own __eq__ and is
    # ambiguous for array-like masks.
    selected = outputs if mask is None else outputs[mask]
    dist = torch.sum((selected - data_center) ** 2, dim=1)

    scores = dist - radius ** 2
    return dist,scores

def init_center(args,input_g,input_feat, model, eps=0.001):
    """Initialize hypersphere center c as the mean from an initial forward pass on the data.

    Args:
        args: unused; kept so existing callers keep working.
        input_g: graph passed to the model.
        input_feat: node features passed to the model.
        model: module called as ``model(input_g, input_feat)``; it is put in
            eval mode and left there.
        eps: minimum absolute value allowed for any center coordinate.

    Returns:
        1-D tensor: the mean of the model outputs over dim 0, on the same
        device as the outputs.
    """
    model.eval()
    with torch.no_grad():
        outputs= model(input_g,input_feat)
        # Mean over the sample dimension. The zero tensor that used to be
        # pre-allocated on a hard-coded device was always overwritten and only
        # made CPU runs fail when args.gpu >= 0.
        c = torch.sum(outputs, dim=0) / outputs.shape[0]

    # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights.
    # Exact zeros are pushed to +eps as well; the previous strict `c > 0` test
    # left them untouched although they are the worst case.
    near_zero = torch.abs(c) < eps
    c[near_zero & (c < 0)] = -eps
    c[near_zero & (c >= 0)] = eps

    return c


def get_radius(dist: torch.Tensor, nu: float):
    """Return the (1 - nu)-quantile of the distances as the new radius R.

    ``dist`` holds squared distances; the square root is taken before the
    quantile is computed with NumPy on a host-side copy.
    """
    squared = dist.clone().data.cpu().numpy()
    distances = np.sqrt(squared)
    return np.quantile(distances, 1 - nu)

class EarlyStopping:
    """Checkpointing early stopper driven by a validation score and loss.

    A step counts as "no improvement" only when the score is lower than the
    reference score AND the loss is higher than the reference loss. Any other
    step saves a checkpoint, resets the counter and makes the current
    score/loss the new reference (so the reference is the last checkpointed
    step, not necessarily the global optimum of each metric).
    """

    def __init__(self, patience=10):
        self.patience = patience    # consecutive non-improving steps tolerated
        self.counter = 0            # current run of non-improving steps
        self.best_score = None      # score at the last checkpoint
        self.best_epoch = None      # epoch of the last checkpoint
        self.lowest_loss = None     # loss at the last checkpoint
        self.early_stop = False     # becomes True once patience is exhausted

    def step(self, acc,loss, model,epoch,path):
        """Record one validation result.

        Args:
            acc: validation score (higher is better).
            loss: validation loss (lower is better).
            model: object exposing ``state_dict()``; checkpointed on improvement.
            epoch: current epoch index.
            path: checkpoint file path.

        Returns:
            True when training should stop.
        """
        score = acc
        cur_loss=loss
        if (self.best_score is None) or (self.lowest_loss is None):
            # First call: everything is an improvement. best_epoch is recorded
            # here too; it previously stayed None when epoch 0 was the best.
            self.best_score = score
            self.lowest_loss = cur_loss
            self.best_epoch = epoch
            self.save_checkpoint(acc,loss,model,path)
        elif (score < self.best_score) and (cur_loss > self.lowest_loss):
            self.counter += 1
            if self.counter >= 0.8*(self.patience):
                print(f'Warning: EarlyStopping soon: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.lowest_loss = cur_loss
            self.best_epoch = epoch
            self.save_checkpoint(acc,loss,model,path)
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, acc,loss,model,path):
        '''Save the model state dict to ``path`` and print the loss and AUC.'''
        print('model saved. loss={:.4f} AUC={:.4f}'. format(loss,acc))
        torch.save(model.state_dict(), path)

In [None]:
import time
import numpy as np
import torch
import logging
#from dgl.contrib.sampling.sampler import NeighborSampler
# import torch.nn as nn
# import torch.nn.functional as F



from optim.loss import loss_function,init_center,get_radius,EarlyStopping

from utils import fixed_graph_evaluate

def train(args,logger,data,model,path):
    """Train a one-class GNN on a single fixed graph with the hypersphere loss.

    The center is initialised from a first forward pass (``init_center``), the
    radius is re-estimated after every epoch from the training distances
    (``get_radius``), and the model is evaluated on the validation nodes each
    epoch. With ``args.early_stop`` the best checkpoint is reloaded before the
    final test evaluation.

    Args:
        args: namespace providing ``gpu``, ``nu``, ``lr``, ``weight_decay``,
            ``n_epochs``, ``early_stop`` and the hyper-parameters that are logged.
        logger: logger receiving the hyper-parameter summary.
        data: dict with keys ``'g'``, ``'features'``, ``'train_mask'``,
            ``'val_mask'``, ``'test_mask'`` and ``'n_edges'``.
        model: module called as ``model(g, features)``.
        path: checkpoint file used by early stopping.

    Returns:
        The trained model.
    """
    if args.gpu < 0:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:%d' % args.gpu)
    checkpoints_path=path

    logger.info('Start training')
    logger.info(f'dropout:{args.dropout}, nu:{args.nu},seed:{args.seed},lr:{args.lr},self-loop:{args.self_loop},norm:{args.norm}')

    logger.info(f'n-epochs:{args.n_epochs}, n-hidden:{args.n_hidden},n-layers:{args.n_layers},weight-decay:{args.weight_decay}')

    optimizer = torch.optim.AdamW(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    if args.early_stop:
        stopper = EarlyStopping(patience=100)

    input_feat=data['features']
    input_g=data['g']

    # initialize data center
    data_center= init_center(args,input_g,input_feat, model)
    # Radius R starts at 0. It is a float tensor from the beginning so later
    # estimates can simply replace it (it used to be an int tensor whose
    # `.data` was overwritten with a float tensor).
    radius=torch.tensor(0.0, device=device)

    # Arrays holding the result curves (only consumed by the optional np.savez below).
    arr_epoch=np.arange(args.n_epochs)
    arr_loss=np.zeros(args.n_epochs)
    arr_valauc=np.zeros(args.n_epochs)
    arr_testauc=np.zeros(args.n_epochs)

    # Per-epoch durations; the first three epochs are treated as warm-up.
    dur = []
    for epoch in range(args.n_epochs):
        # init_center() above leaves the model in eval mode, and the
        # evaluation helper below may do the same (TODO confirm), so training
        # mode is (re-)enabled at the start of every epoch. This is a no-op
        # when the model is already in training mode.
        model.train()
        t0 = time.time()

        # forward
        outputs= model(input_g,input_feat)

        loss,dist,_=loss_function(args.nu, data_center,outputs,radius,data['train_mask'])
        # record the training loss
        arr_loss[epoch]=loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch>= 3:
            dur.append(time.time() - t0)
        # NaN until the warm-up epochs have passed (same printed value as
        # np.mean([]), without the empty-slice warning).
        mean_dur = np.mean(dur) if dur else float('nan')

        radius=torch.tensor(get_radius(dist, args.nu), device=device)

        auc,ap,f1,acc,precision,recall,val_loss = fixed_graph_evaluate(args,checkpoints_path, model, data_center,data,radius,data['val_mask'])
        # record the validation AUC
        arr_valauc[epoch]=auc
        print("Epoch {:05d} | Time(s) {:.4f} | Train Loss {:.4f} | Val Loss {:.4f} | Val AUROC {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, mean_dur, loss.item()*100000,
                                            val_loss.item()*100000, auc, data['n_edges'] / mean_dur / 1000))
        print(f'Val f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}')
        if args.early_stop:
            if stopper.step(auc,val_loss.item(), model,epoch,checkpoints_path):
                break

    if args.early_stop:
        print('loading model before testing.')
        model.load_state_dict(torch.load(checkpoints_path))

    auc,ap,f1,acc,precision,recall,loss = fixed_graph_evaluate(args,checkpoints_path,model, data_center,data,radius,data['test_mask'])
    test_dur = 0
    # record the test AUC at the last executed epoch (skipped when no epoch ran)
    if arr_testauc.size:
        arr_testauc[epoch]=auc
    print("Test Time {:.4f} | Test AUROC {:.4f} | Test AUPRC {:.4f}".format(test_dur,auc,ap))
    print(f'Test f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}')

    #np.savez('SAGE-2.npz',epoch=arr_epoch,loss=arr_loss,valauc=arr_valauc,testauc=arr_testauc)

    return model




In [None]:
import time
import numpy as np
import torch
import os
import torch.nn as nn
import logging
#from dgl.contrib.sampling.sampler import NeighborSampler
# import torch.nn as nn
# import torch.nn.functional as F



from optim.loss import loss_function,init_center,get_radius,EarlyStopping

from utils.evaluate import multi_graph_evaluate

def train(args, logger,dataset, model, val_dataset=None,path=None):
    '''
    Train a one-class GNN on batched graphs (TU datasets).

    Every batch is standardised with a fresh, non-affine BatchNorm1d over the
    ``node_attr`` features, passed through the model and optimised with the
    hypersphere loss against a fixed zero center and a fixed zero radius
    (neither is updated in this function). Nodes whose ``node_labels`` entry
    is 0 form the training mask. When ``val_dataset`` is given the model is
    evaluated after each epoch and, with ``args.early_stop``, checkpointed and
    stopped through ``EarlyStopping``.

    Args:
        args: namespace providing ``gpu``, ``nu``, ``lr``, ``weight_decay``,
            ``n_hidden``, ``n_epochs``, ``early_stop`` and the logged
            hyper-parameters.
        logger: logger receiving the hyper-parameter summary.
        dataset: iterable yielding ``(batch_graph, graph_labels)`` pairs.
        model: module called as ``model(batch_graph, input_attr)``.
        val_dataset: optional validation loader for ``multi_graph_evaluate``.
        path: checkpoint file used by early stopping.

    Returns:
        The trained model (last state; the best checkpoint is not reloaded).
    '''
    checkpoints_path=path
    # Single source of truth for the device. The previous code hard-coded
    # f'cuda:{args.gpu}' and .cuda(), which broke args.gpu < 0 even though the
    # batch transfer was guarded by torch.cuda.is_available().
    if args.gpu < 0:
        device = torch.device('cpu')
    else:
        device = torch.device(f'cuda:{args.gpu}')

    logger.info('Start training')
    logger.info(f'dropout:{args.dropout}, nu:{args.nu},seed:{args.seed},lr:{args.lr},self-loop:{args.self_loop},norm:{args.norm}')

    logger.info(f'n-epochs:{args.n_epochs}, n-hidden:{args.n_hidden},n-layers:{args.n_layers},weight-decay:{args.weight_decay}')

    dataloader = dataset
    optimizer = torch.optim.AdamW(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    if args.early_stop:
        stopper = EarlyStopping(patience=100)

    data_center= torch.zeros(args.n_hidden, device=device)
    radius=torch.tensor(0, device=device)# radius R initialized with 0 by default.
    for epoch in range(args.n_epochs):
        # The validation helper below may leave the model in eval mode
        # (TODO confirm), so training mode is re-enabled every epoch. This is
        # a no-op when the model is already in training mode.
        model.train()
        begin_time = time.time()
        print("EPOCH ###### {} ######".format(epoch))
        computation_time = 0.0
        for (batch_idx, (batch_graph, graph_labels)) in enumerate(dataloader):
            # Move all node data to the training device (snapshot the items so
            # the mapping is not mutated while it is being iterated).
            for (key, value) in list(batch_graph.ndata.items()):
                batch_graph.ndata[key] = value.to(device)
            # Nodes labelled 0 are used for training.
            train_mask=~batch_graph.ndata['node_labels'].bool().squeeze()
            model.zero_grad()
            compute_start = time.time()

            # A fresh non-affine BatchNorm in training mode standardises the
            # attributes with this batch's own statistics.
            normlizing = nn.BatchNorm1d(batch_graph.ndata['node_attr'].shape[1], affine=False).to(device)
            input_attr=normlizing(batch_graph.ndata['node_attr'])

            outputs = model(batch_graph,input_attr)

            loss,dist,score=loss_function(args.nu, data_center,outputs,radius,train_mask)
            loss.backward()
            batch_compute_time = time.time() - compute_start
            computation_time += batch_compute_time
            optimizer.step()

            print('RRR',radius.data)
            print("Epoch {:05d},loss {:.4f} with {}-th batch time(s) {:.4f}".format(
            epoch, loss.item(), batch_idx, computation_time))
        elapsed_time = time.time() - begin_time
        if val_dataset is not None:
            auc,ap,f1,acc,precision,recall,loss = multi_graph_evaluate(args,checkpoints_path, model, data_center,val_dataset,radius,'val')
            print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Val AUROC {:.4f} | Val F1 {:.4f} | Val ACC {:.4f} | ". format(
                epoch, elapsed_time, loss.item()*100000, auc,f1,acc))
            torch.cuda.empty_cache()
            if args.early_stop:
                if stopper.step(auc,loss.item()*100000, model,epoch,checkpoints_path):
                    print("best epoch is EPOCH {}, val_auc is {}%".format(stopper.best_epoch,
                                                        stopper.best_score))
                    break

    return model


In [None]:
from dgl.data import load_data, tu
from dgl import DGLGraph, transforms
import torch
import torch.utils.data
import numpy as np
import torch
import dgl
import networkx as nx
from datasets.prepocessing import one_class_processing

def loader(args):
    """Load a node-level dataset and prepare it for one-class training.

    Loads the dataset with ``dgl.data.load_data``, relabels it for the
    one-class setting (``one_class_processing``), converts features, labels
    and masks to tensors, optionally adds self-loops and symmetric-norm
    factors to the graph, and moves everything to ``cuda:<args.gpu>`` when
    ``args.gpu >= 0``.

    Args:
        args: namespace providing ``dataset`` (read by ``load_data``),
            ``normal_class``, ``gpu``, ``self_loop`` and ``norm``.

    Returns:
        dict with keys ``'g'``, ``'features'``, ``'labels'``, ``'train_mask'``,
        ``'val_mask'``, ``'test_mask'``, ``'input_dim'``, ``'n_classes'`` and
        ``'n_edges'``.
    """
    # load and preprocess dataset
    data = load_data(args)

    print(f'normal_class is {args.normal_class}')

    labels,train_mask,val_mask,test_mask=one_class_processing(data,args.normal_class,args)
    features= data[0].ndata['feat']
    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels.numpy())
    train_mask = torch.BoolTensor(train_mask)
    val_mask = torch.BoolTensor(val_mask)
    test_mask = torch.BoolTensor(test_mask)
    in_feats = features.shape[1]
    n_classes = data.num_labels
    n_edges = data[0].number_of_edges()
    # Previously printed the whole namespace under the "args.gpu" label.
    print("args.gpu: ", args.gpu)
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.sum().item(),
              val_mask.sum().item(),
              test_mask.sum().item()))

    cuda = args.gpu >= 0
    if cuda:
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    g = data[0]

    # Add exactly one self-loop per node. data[0] is a DGLGraph (its .ndata is
    # read above), so the DGL transforms are used; the former networkx calls
    # (nx.selfloop_edges / add_edges_from) do not exist on a DGLGraph.
    if args.self_loop:
        g = dgl.remove_self_loop(g)
        g = dgl.add_self_loop(g)

    # The graph is already a DGLGraph, so it is not re-wrapped. It follows the
    # tensors to the selected device instead of always going to cuda:0.
    if cuda:
        g = g.to(torch.device('cuda:%d' % args.gpu))
    n_edges = g.number_of_edges()
    if args.norm:
        # Normalization factor deg^-1/2; isolated nodes (infinite factor) get 0.
        degs = g.in_degrees().float()
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 0
        if cuda:
            norm = norm.cuda()
        g.ndata['norm'] = norm.unsqueeze(1)

    datadict={'g':g,'features':features,'labels':labels,'train_mask':train_mask,
        'val_mask':val_mask,'test_mask': test_mask,'input_dim':in_feats,'n_classes':n_classes,'n_edges':n_edges}

    return datadict

def emb_dataloader(args):
    """Load a dataset for the embedding baselines (CPU tensors, raw graph).

    Returns a dict with keys ``'g'``, ``'features'``, ``'labels'``,
    ``'train_mask'``, ``'val_mask'``, ``'test_mask'``, ``'in_feats'``,
    ``'n_classes'`` and ``'n_edges'``.
    """
    # load the dataset and derive one-class labels and split masks
    data = load_data(args)
    raw_labels,raw_train,raw_val,raw_test=one_class_processing(data,args.normal_class,args)

    # tensor conversion
    features = torch.FloatTensor(data.features)
    labels = torch.LongTensor(raw_labels)
    train_mask = torch.BoolTensor(raw_train)
    val_mask = torch.BoolTensor(raw_val)
    test_mask = torch.BoolTensor(raw_test)

    in_feats = features.shape[1]
    n_classes = data.num_labels
    n_edges = data.graph.number_of_edges()

    split_sizes = (train_mask.sum().item(),
                   val_mask.sum().item(),
                   test_mask.sum().item())
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          ((n_edges, n_classes) + split_sizes))

    return {
        'g': data.graph,
        'features': features,
        'labels': labels,
        'train_mask': train_mask,
        'val_mask': val_mask,
        'test_mask': test_mask,
        'in_feats': in_feats,
        'n_classes': n_classes,
        'n_edges': n_edges,
    }

In [None]:
import numpy as np
import tensorflow as tf

def one_class_processing(data,normal_class:int,args=None):
    """Relabel the first graph of ``data`` for one-class learning and split it.

    Reads the node labels from ``data[0].ndata['label']``, binarises them
    against ``normal_class`` and returns
    ``(labels, train_mask, val_mask, test_mask)`` from ``one_class_masking``.
    """
    node_labels = data[0].ndata['label']
    binary_labels, normal_ids, abnormal_ids = one_class_labeling(node_labels, normal_class)
    return one_class_masking(args, data, binary_labels, normal_ids, abnormal_ids)


def one_class_labeling(labels,normal_class:int):
    """Binarise ``labels`` against ``normal_class`` and shuffle the index sets.

    NOTE: ``labels`` is modified in place (normal -> 0, abnormal -> 1) before
    being cast, so the caller's label container is overwritten as well.

    Returns:
        ``(labels, normal_idx, abnormal_idx)``: the labels cast to ``tf.bool``
        and the shuffled index arrays of normal and abnormal entries.
    """
    # Index sets are computed before any relabelling takes place.
    normal_idx=np.where(labels==normal_class)[0]
    abnormal_idx=np.where(labels!=normal_class)[0]

    labels[normal_idx]=0
    labels[abnormal_idx]=1
    # Shuffled in place with NumPy's global RNG (seeded by the caller, if at
    # all); normal indices are shuffled first, then abnormal ones.
    np.random.shuffle(normal_idx)
    np.random.shuffle(abnormal_idx)
    # NOTE(review): TensorFlow is used only for this boolean cast; callers
    # then rely on tensor methods such as `.numpy()` on the result.
    labels = tf.cast(labels, dtype=tf.bool)
    return labels, normal_idx, abnormal_idx

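# Training set: 60% of the normal samples; validation set: 15% of the normal samples; test set: 25% of the normal samples. In the validation and test sets the normal:abnormal ratio is 1:1.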
def one_class_masking(args,data,labels,normal_idx,abnormal_idx):
	train_mask=np.zeros(labels.shape,dtype='bool')
	val_mask=np.zeros(labels.shape,dtype='bool')
	test_mask=np.zeros(labels.shape,dtype='bool')
	
	if args.dataset=='reddit':
		train_mask=np.logical_and(data.train_mask,~labels)
		val_mask=masking_reddit(data.val_mask,labels)
		test_mask=masking_reddit(data.test_mask,labels)
	else:
		train_mask[normal_idx[:int(0.6*normal_idx.shape[0])]]=1

		val_mask[normal_idx[int(0.6*normal_idx.shape[0]):int(0.75*normal_idx.shape[0])]]=1
		val_mask[abnormal_idx[:int(0.15*normal_idx.shape[0])]]=1

		test_mask[normal_idx[int(0.75*normal_idx.shape[0]):]]=1
		test_mask[abnormal_idx[-int(0.25*normal_idx.shape[0]):]]=1

	return labels,train_mask,val_mask,test_mask  

def masking_reddit(mask,labels):
    """Rebalance ``mask`` so it holds as many abnormal as normal nodes.

    All normal nodes selected by ``mask`` are kept; abnormal nodes selected
    by ``mask`` are randomly dropped until at most as many remain as there
    are normal nodes.

    :param mask: boolean array selecting the candidate nodes.
    :param labels: boolean array, truthy for abnormal nodes.
    :return: the rebalanced boolean mask.
    """
    normal = np.logical_and(~labels, mask)
    abnormal = np.logical_and(labels, mask)

    idx = np.where(abnormal == 1)[0]
    np.random.shuffle(idx)

    # Count how many abnormal nodes to discard. Slicing with idx[:-n_keep]
    # breaks for n_keep == 0 (it becomes idx[:0] and drops nothing), so the
    # number of dropped entries is computed explicitly.
    n_keep = int(normal.sum())
    n_drop = max(idx.shape[0] - n_keep, 0)
    abnormal[idx[:n_drop]] = 0

    return np.logical_or(normal, abnormal)




'''
全部节点都用于测试集分类
def one_class_masking(labels,normal_idx,abnormal_idx):
    train_mask=np.zeros(labels.shape)
    val_mask=np.zeros(labels.shape)
    test_mask=np.zeros(labels.shape)

    train_mask[normal_idx[:int(0.7*normal_idx.shape[0])]]=1

    val_mask[normal_idx[int(0.7*normal_idx.shape[0]):int(0.8*normal_idx.shape[0])]]=1
    val_mask[abnormal_idx[:int(0.3*abnormal_idx.shape[0])]]=1

    test_mask[normal_idx[int(0.8*normal_idx.shape[0]):]]=1
    test_mask[abnormal_idx[int(0.3*abnormal_idx.shape[0]):]]=1

    return labels,train_mask,val_mask,test_mask  
'''

In [None]:
from dgl.data import load_data, tu
from dgl import DGLGraph, transforms
import torch
import torch.utils.data
import numpy as np
import torch
import dgl
import networkx as nx


# add self loop for TU dataset, other datasets haven't been tested. 
def pre_process(args,dataset):
    """Binarise node labels of every graph and optionally add self loops.

    For each graph in ``dataset.graph_lists`` the ``'node_labels'`` node
    feature is rewritten in place: nodes equal to ``args.normal_class``
    become 0, every other node becomes 1. When ``args.self_loop`` is truthy
    the graph is replaced by a copy with self loops added.

    :param args: namespace providing ``normal_class`` and ``self_loop``.
    :param dataset: dataset exposing ``__len__`` and ``graph_lists``.
    :return: the same ``dataset`` object, modified in place.
    """
    for i in range(len(dataset)):
        graph = dataset.graph_lists[i]

        # Compute both index sets before overwriting anything so the new
        # 0/1 values cannot be mistaken for original class ids.
        node_labels = graph.ndata['node_labels']
        normal_idx = torch.where(node_labels == args.normal_class)[0]
        abnormal_idx = torch.where(node_labels != args.normal_class)[0]
        graph.ndata['node_labels'][normal_idx] = 0
        graph.ndata['node_labels'][abnormal_idx] = 1

        if args.self_loop:
            # Use the top-level function: the `dgl.transform` module path is
            # not available in DGL releases that ship `dgl.transforms`
            # (which is what this file imports).
            g = dgl.add_self_loop(graph)
            # Copy the node features over in case the new graph lacks them.
            g.ndata.update(graph.ndata)
            dataset.graph_lists[i] = g

    return dataset

def loader(args):
    """Load a TU dataset and return train/val/test loaders plus dimensions.

    The dataset named ``args.dataset`` is split 60% / 15% / 25% (the
    validation share is whatever remains after train and test), labels are
    binarised by ``pre_process`` and each split is wrapped by
    ``prepare_dataloader``.

    :return: ``(train_loader, val_loader, test_loader, input_dim, label_dim)``.
    """
    dataset = tu.TUDataset(name=args.dataset)

    n_train = int(0.6 * len(dataset))
    n_test = int(0.25 * len(dataset))
    n_val = int(len(dataset) - n_train - n_test)

    dataset = pre_process(args, dataset)

    train_split, val_split, test_split = torch.utils.data.random_split(
        dataset, (n_train, n_val, n_test))
    train_loader = prepare_dataloader(train_split, args, train=True)
    val_loader = prepare_dataloader(val_split, args, train=False)
    test_loader = prepare_dataloader(test_split, args, train=False)

    # NOTE(review): the original author states that `statistics()` comes from
    # a locally modified dgl.tu -- confirm the three-value return shape.
    input_dim, label_dim, max_num_node = dataset.statistics()
    print("++++++++++STATISTICS ABOUT THE DATASET")
    for caption, value in (
            ("dataset feature dimension is", input_dim),
            ("dataset label dimension is", label_dim),
            ("the max num node is", max_num_node),
            ("number of graphs is", len(dataset))):
        print(caption, value)

    return train_loader, val_loader, test_loader, input_dim, label_dim


def prepare_dataloader(dataset, args, train=False, pre_process=None):
    '''
    Wrap ``dataset`` in a ``torch.utils.data.DataLoader``.

    Training loaders shuffle and drop the last incomplete batch; evaluation
    loaders do neither. If ``pre_process`` is given it is called as
    ``pre_process(dataset, args)`` before the loader is built. Batch size and
    worker count come from ``args.batch_size`` and ``args.n_worker``.
    '''
    # Shuffling and dropping the last batch are both tied to training mode.
    is_training = bool(train)

    if pre_process:
        pre_process(dataset, args)

    loader_kwargs = dict(
        batch_size=args.batch_size,
        shuffle=is_training,
        collate_fn=batching_graph,
        drop_last=is_training,
        num_workers=args.n_worker,
    )
    return torch.utils.data.DataLoader(dataset, **loader_kwargs)

def batching_graph(batch):
    '''
    Collate function: merge ``(graph, label)`` samples into one batch.

    Every node feature of every graph is cast with ``.float()``, the graphs
    are merged with ``dgl.batch`` and the labels are packed into a
    ``torch.LongTensor``.

    Returns ``(batched_graphs, batched_labels)``.
    '''
    graphs, labels = (list(column) for column in zip(*batch))

    # Cast node features to float before batching.
    for g in graphs:
        for name, tensor in g.ndata.items():
            g.ndata[name] = tensor.float()

    merged = dgl.batch(graphs)
    label_tensor = torch.LongTensor(np.array(labels))
    return merged, label_tensor

In [None]:
from .evaluate import fixed_graph_evaluate, multi_graph_evaluate
from .evaluate import thresholding,baseline_evaluate

In [None]:
from sklearn.metrics import f1_score, accuracy_score,precision_score,recall_score,average_precision_score,roc_auc_score,roc_curve
import torch
from optim.loss import loss_function,anomaly_score
import numpy as np
import torch.nn as nn
import time

def fixed_graph_evaluate(args,path,model, data_center,data,radius,mask):
    """Evaluate ``model`` on the nodes of a single graph selected by ``mask``.

    :param args: namespace; ``args.nu`` is forwarded to ``loss_function``.
    :param path: not used by this function.
    :param model: network called as ``model(data['g'], data['features'])``.
    :param data_center: forwarded to ``anomaly_score`` / ``loss_function``.
    :param data: dict providing ``'g'``, ``'features'`` and ``'labels'``.
    :param radius: forwarded to ``anomaly_score`` / ``loss_function``.
    :param mask: tensor used to index ``data['labels']``.
    :return: ``(auc, ap, f1, acc, precision, recall, loss)``; predictions
        are the scores thresholded at 0.
    """
    model.eval()
    with torch.no_grad():
        all_labels = data['labels']
        y_true = all_labels[mask]
        # NOTE(review): the loss is taken over masked nodes whose label is
        # truthy, whereas multi_graph_evaluate uses `~labels` -- confirm
        # which of the two is intended.
        loss_mask = mask.bool() & all_labels.bool()

        embeddings = model(data['g'], data['features'])

        _, scores = anomaly_score(data_center, embeddings, radius, mask)
        loss, _, _ = loss_function(
            args.nu, data_center, embeddings, radius, loss_mask)

        y_true = y_true.cpu().numpy()
        scores = scores.cpu().numpy()

        # A positive score is flagged as an anomaly.
        y_pred = thresholding(scores, 0)

        auc = roc_auc_score(y_true, scores)
        ap = average_precision_score(y_true, scores)

        acc = accuracy_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        return auc, ap, f1, acc, precision, recall, loss

def multi_graph_evaluate(args,path, model, data_center,dataloader,radius,mode='val'):
    '''
    Evaluate ``model`` over every batch yielded by ``dataloader``.

    :param args: namespace; ``args.nu`` is forwarded to ``loss_function``.
    :param path: checkpoint path, loaded into ``model`` when ``mode=='test'``.
    :param model: network called as ``model(batch_graph, input_attr)``.
    :param data_center: forwarded to ``anomaly_score`` / ``loss_function``.
    :param dataloader: iterable of ``(batch_graph, graph_labels)`` pairs;
        each graph must carry ``'node_attr'`` and ``'node_labels'`` ndata.
    :param radius: forwarded to ``anomaly_score`` / ``loss_function``.
    :param mode: ``'test'`` loads the checkpoint first; anything else
        evaluates the model as passed in.
    :return: ``(auc, ap, f1, acc, precision, recall, total_loss)`` where
        ``total_loss`` is the mean per-batch loss and predictions are the
        scores thresholded at 0.
    :raises ValueError: if ``dataloader`` yields no batch.
    '''
    if mode=='test':
        print('model loaded.')
        model.load_state_dict(torch.load(path))
    model.eval()

    use_cuda = torch.cuda.is_available()
    total_loss = 0
    n_batches = 0
    # Collect per-batch arrays and concatenate once at the end instead of
    # re-allocating the accumulated arrays on every batch.
    label_chunks, pred_chunks, score_chunks = [], [], []

    with torch.no_grad():
        for batch_graph, graph_labels in dataloader:
            if use_cuda:
                for (key, value) in batch_graph.ndata.items():
                    batch_graph.ndata[key] = value.cuda()

            # A fresh non-affine BatchNorm1d is built for every batch (a new
            # module defaults to training mode). It is placed on the same
            # device as the features: the former unconditional `.cuda()`
            # crashed on CPU-only hosts.
            node_attr = batch_graph.ndata['node_attr']
            normlizing = nn.BatchNorm1d(node_attr.shape[1], affine=False)
            normlizing = normlizing.to(node_attr.device)
            input_attr = normlizing(node_attr)

            outputs = model(batch_graph, input_attr)

            labels = batch_graph.ndata['node_labels']
            # The loss is computed on nodes whose label is falsy.
            loss_mask = ~labels.bool().squeeze()
            _, scores = anomaly_score(data_center, outputs, radius, mask=None)
            loss, _, _ = loss_function(
                args.nu, data_center, outputs, radius, loss_mask)

            scores = scores.cpu().numpy()
            label_chunks.append(labels.cpu().numpy().astype('int8'))
            score_chunks.append(scores)
            pred_chunks.append(thresholding(scores, 0))

            total_loss += loss
            n_batches += 1

        if n_batches == 0:
            raise ValueError('dataloader yielded no batches to evaluate')

        total_loss /= n_batches
        labels_vec = np.concatenate(label_chunks, axis=0)
        scores_vec = np.concatenate(score_chunks, axis=0)
        pred_vec = np.concatenate(pred_chunks)

        print('score std',scores_vec.std())
        print('score mean',scores_vec.mean())
        print('labels mean',labels_vec.mean())
        print('pred mean',pred_vec.mean())
        auc=roc_auc_score(labels_vec, scores_vec)
        ap=average_precision_score(labels_vec, scores_vec)

        acc=accuracy_score(labels_vec,pred_vec)
        recall=recall_score(labels_vec,pred_vec)
        precision=precision_score(labels_vec,pred_vec)
        f1=f1_score(labels_vec,pred_vec)

    return auc,ap,f1,acc,precision,recall,total_loss


def thresholding(recon_error,threshold):
    """Turn anomaly scores into 0/1 predictions.

    :param recon_error: array-like of scores with one entry per sample
        (shape ``(n,)`` or ``(n, 1)``).
    :param threshold: scores strictly greater than this are flagged.
    :return: float64 array of shape ``(n,)`` holding 1.0 for flagged samples
        and 0.0 otherwise.
    :raises ValueError: if a sample has more than one score.
    """
    scores = np.asarray(recon_error)
    # Vectorised comparison; the reshape keeps the one-value-per-sample
    # contract and rejects inputs with several scores per row.
    flagged = (scores > threshold).reshape(scores.shape[0])
    return flagged.astype(np.float64)

def baseline_evaluate(datadict,y_pred,y_score,val=True):
    """Score baseline predictions on the validation or test split.

    :param datadict: dict providing ``'labels'``, ``'val_mask'`` and
        ``'test_mask'``.
    :param y_pred: hard predictions for the selected split.
    :param y_score: continuous scores for the selected split.
    :param val: truthy selects the validation mask, falsy the test mask.
    :return: ``(auc, ap, f1, acc, precision, recall)``.
    """
    # A plain truthiness test always binds `mask`; the former pair of
    # `val==True` / `val==False` checks left it undefined for other values.
    mask = datadict['val_mask'] if val else datadict['test_mask']
    y_true = datadict['labels'][mask]

    auc = roc_auc_score(y_true, y_score)
    ap = average_precision_score(y_true, y_score)
    acc = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return auc,ap,f1,acc,precision,recall



In [None]:
from sklearn.metrics import roc_curve,auc,average_precision_score,precision_recall_curve
import matplotlib.pyplot as plt

def plot_ROC(y_test, recon_error_test):
    """Draw the ROC curve for the given labels/scores, save it and show it.

    The figure is written to ``'ROC'`` at 1200 dpi; the area under the curve
    appears in the legend.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, recon_error_test)
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = 'AUC = %0.4f'% area

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    plt.legend(loc='lower right')
    # Dashed red diagonal as a visual reference line.
    plt.plot([0,1],[0,1],'r--')

    plt.xlim([-0.001, 1])
    plt.ylim([0, 1.001])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    plt.savefig('ROC',dpi=1200)
    plt.show()

def plot_PRC(y_test, recon_error_test):
    """Draw the precision-recall curve, save it as ``'PRC'`` and show it.

    The average precision is reported in the figure title.
    """
    average_precision = average_precision_score(y_test, recon_error_test)
    precision, recall, _ = precision_recall_curve(y_test, recon_error_test)

    # The step outline and the filled area share the same colour settings.
    shade = dict(color='b', alpha=0.2)
    plt.step(recall, precision, where='post', **shade)
    plt.fill_between(recall, precision, step='post', **shade)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    title = '2-class Precision-Recall curve: AP={0:0.4f}'.format(
        average_precision)
    plt.title(title)

    plt.savefig('PRC',dpi=1200)
    plt.show()

In [None]:
import numpy as np


def create_alias_table(area_ratio):
    """
    Build the accept/alias tables used by ``alias_sample``.

    :param area_ratio: probabilities, expected to satisfy sum(area_ratio)=1
    :return: accept,alias -- two lists with one entry per input probability
    """
    n = len(area_ratio)
    accept = [0] * n
    alias = [0] * n

    # Scale so that the average bucket height is 1.
    scaled = np.array(area_ratio) * n
    small = [i for i, height in enumerate(scaled) if height < 1.0]
    large = [i for i, height in enumerate(scaled) if not height < 1.0]

    # Top up each under-full bucket from an over-full one.
    while small and large:
        low = small.pop()
        high = large.pop()
        accept[low] = scaled[low]
        alias[low] = high
        scaled[high] = scaled[high] - (1 - scaled[low])
        target = small if scaled[high] < 1.0 else large
        target.append(high)

    # Whatever is left over is accepted unconditionally.
    for leftover in (large, small):
        while leftover:
            accept[leftover.pop()] = 1

    return accept, alias


def alias_sample(accept, alias):
    """
    Draw one index using accept/alias tables.

    :param accept: acceptance thresholds, one per bucket
    :param alias: fallback index for each bucket
    :return: sample index
    """
    # Two draws: first the bucket, then the accept-or-alias decision.
    bucket = int(np.random.random() * len(accept))
    coin = np.random.random()
    return bucket if coin < accept[bucket] else alias[bucket]


In [None]:
from __future__ import print_function


import numpy
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that predicts the top-k classes per sample."""

    def predict(self, X, top_k_list):
        """Return a 0/1 indicator matrix with ``k`` labels set per row.

        :param X: samples passed to ``predict_proba``.
        :param top_k_list: number of labels to predict for each sample.
        :return: array with one indicator row per entry of ``top_k_list``.
        """
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            # argsort()[-0:] would select EVERY class, so a sample that
            # should receive no label is handled explicitly.
            if k > 0:
                labels = self.classes_[probs_.argsort()[-k:]].tolist()
            else:
                labels = []
            # Reuse the probability row as the indicator row.
            probs_[:] = 0
            if labels:
                probs_[labels] = 1
            all_labels.append(probs_)
        return numpy.asarray(all_labels)


class Classifier(object):
    """Multi-label classifier evaluated on pre-computed node embeddings."""

    def __init__(self, embeddings, clf):
        """
        :param embeddings: mapping from node id to its embedding vector.
        :param clf: base estimator wrapped in a ``TopKRanker``.
        """
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on ``(X, Y)``."""
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        """Predict labels for ``X`` and return F1/accuracy against ``Y``.

        :return: dict with the keys ``'micro'``, ``'macro'``, ``'samples'``,
            ``'weighted'`` (F1 averages) and ``'acc'``.
        """
        # Each sample is asked for exactly as many labels as it truly has.
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        results['acc'] = accuracy_score(Y,Y_)
        print('-------------------')
        print(results)
        # The closing separator used to sit after `return` and never ran.
        print('-------------------')
        return results

    def predict(self, X, top_k_list):
        """Return the top-k indicator matrix for the nodes in ``X``."""
        X_ = numpy.asarray([self.embeddings[x] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle, split, train on the first part and evaluate on the rest.

        The global numpy RNG is seeded with ``seed`` for the shuffle and its
        previous state is restored afterwards, even if training fails.

        :param train_precent: fraction of samples used for training.
        :return: the result of ``evaluate`` on the held-out part.
        """
        state = numpy.random.get_state()
        try:
            training_size = int(train_precent * len(X))
            numpy.random.seed(seed)
            shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
            X_train = [X[shuffle_indices[i]] for i in range(training_size)]
            Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
            X_test = [X[shuffle_indices[i]]
                      for i in range(training_size, len(X))]
            Y_test = [Y[shuffle_indices[i]]
                      for i in range(training_size, len(X))]

            self.train(X_train, Y_train, Y)
        finally:
            # Do not leave the caller's RNG in the seeded state.
            numpy.random.set_state(state)
        return self.evaluate(X_test, Y_test)


def read_node_label(filename, skip_head=False):
    """Read a whitespace-separated ``node label [label ...]`` file.

    :param filename: path of the label file.
    :param skip_head: when true, the first line is treated as a header and
        skipped (once).
    :return: ``(X, Y)`` where ``X`` lists the first token of every line and
        ``Y`` lists the remaining tokens of the same line.
    """
    X = []
    Y = []
    # `with` closes the file even if reading fails half-way.
    with open(filename, 'r') as fin:
        if skip_head:
            # Skip the header exactly once; doing this inside the read loop
            # silently dropped every other record.
            fin.readline()
        for line in fin:
            vec = line.strip().split(' ')
            X.append(vec[0])
            Y.append(vec[1:])
    return X, Y


In [None]:
def preprocess_nxgraph(graph):
    """Assign consecutive integer indices to the nodes of ``graph``.

    :param graph: object whose ``nodes()`` method yields the nodes.
    :return: ``(idx2node, node2idx)`` -- the node list in iteration order and
        the mapping from node to its position in that list.
    """
    idx2node = list(graph.nodes())
    node2idx = {node: position for position, node in enumerate(idx2node)}
    return idx2node, node2idx


def partition_dict(vertices, workers):
    """Split the items of ``vertices`` into consecutive chunks for workers.

    :param vertices: dict whose ``(key, value)`` pairs are partitioned.
    :param workers: number of workers; fixes the chunk size as
        ``ceil(len(vertices) / workers)``.
    :return: list of lists of ``(key, value)`` tuples.
    """
    chunk_len = (len(vertices) - 1) // workers + 1
    chunks = []
    current = []
    for seen, pair in enumerate(vertices.items(), start=1):
        current.append(pair)
        # Close the chunk every `chunk_len` items.
        if seen % chunk_len == 0:
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks


def partition_list(vertices, workers):
    """Split ``vertices`` into consecutive chunks of ``(index, item)`` pairs.

    :param vertices: sequence to partition.
    :param workers: number of workers; fixes the chunk size as
        ``ceil(len(vertices) / workers)``.
    :return: list of lists of ``(index, item)`` tuples.
    """
    chunk_len = (len(vertices) - 1) // workers + 1
    chunks = []
    current = []
    for seen, indexed_item in enumerate(enumerate(vertices), start=1):
        current.append(indexed_item)
        # Close the chunk every `chunk_len` items.
        if seen % chunk_len == 0:
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks


def partition_num(num, workers):
    """Split ``num`` into per-worker counts.

    Returns ``workers`` equal shares of ``num // workers``; when ``num`` is
    not divisible, one extra entry holding the remainder is appended.
    """
    share, remainder = divmod(num, workers)
    parts = [share] * workers
    if remainder != 0:
        parts = parts + [remainder]
    return parts


In [None]:
import itertools
import math
import random

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import trange

from .alias import alias_sample, create_alias_table
from .utils import partition_num


class RandomWalker:
    """Generate DeepWalk / node2vec random walks over a graph.

    ``G`` must expose the graph API used below: ``neighbors``, ``nodes``,
    ``edges``, ``has_edge`` and ``G[u][v]`` edge-attribute access.
    """

    def __init__(self, G, p=1, q=1):
        """
        :param G: graph to walk on.
        :param p: Return parameter, controls the likelihood of immediately revisiting a node in the walk.
        :param q: In-out parameter, allows the search to differentiate between "inward" and "outward" nodes
        """
        self.G = G
        self.p = p
        self.q = q

    def deepwalk_walk(self, walk_length, start_node):
        """Uniform random walk of at most ``walk_length`` nodes.

        The walk stops early when it reaches a node without neighbours.
        """
        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk

    def node2vec_walk(self, walk_length, start_node):
        """Biased (node2vec) random walk of at most ``walk_length`` nodes.

        Requires ``preprocess_transition_probs`` to have been called so that
        ``self.alias_nodes`` and ``self.alias_edges`` exist.
        """
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    # First step: no previous node, use the node-level table.
                    walk.append(
                        cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    # Later steps depend on the edge just traversed.
                    prev = walk[-2]
                    edge = (prev, cur)
                    next_node = cur_nbrs[alias_sample(alias_edges[edge][0],
                                                      alias_edges[edge][1])]
                    walk.append(next_node)
            else:
                break

        return walk

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        """Run ``num_walks`` walks from every node, spread over ``workers`` jobs.

        :return: flat list of walks (each a list of nodes).
        """
        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length,):
        """Worker body: ``num_walks`` passes over ``nodes`` (shuffled in place)."""
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                # p == q == 1 means unbiased walks, i.e. plain DeepWalk.
                if self.p == 1 and self.q == 1:
                    walks.append(self.deepwalk_walk(
                        walk_length=walk_length, start_node=v))
                else:
                    walks.append(self.node2vec_walk(
                        walk_length=walk_length, start_node=v))
        return walks

    def get_alias_edge(self, t, v):
        """
        compute unnormalized transition probability between nodes v and its neighbors give the previous visited node t.
        :param t: previously visited node
        :param v: current node
        :return: accept/alias tables over the neighbours of ``v``
        """
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for x in G.neighbors(v):
            weight = G[v][x].get('weight', 1.0)  # w_vx
            if x == t:  # d_tx == 0
                unnormalized_probs.append(weight/p)
            elif G.has_edge(x, t):  # d_tx == 1
                unnormalized_probs.append(weight)
            else:  # d_tx > 1
                unnormalized_probs.append(weight/q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob)/norm_const for u_prob in unnormalized_probs]

        return create_alias_table(normalized_probs)

    def preprocess_transition_probs(self):
        """
        Preprocessing of transition probabilities for guiding the random walks.

        Fills ``self.alias_nodes`` (one table per node) and
        ``self.alias_edges`` (one table per traversable ``(prev, cur)`` pair).
        """
        G = self.G

        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr].get('weight', 1.0)
                                  for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [
                float(u_prob)/norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = create_alias_table(normalized_probs)

        alias_edges = {}

        for t, v in G.edges():
            # A walk can traverse an edge in any direction the graph allows,
            # but an undirected graph reports each edge only once. Register
            # the reverse orientation too whenever it exists; otherwise
            # node2vec_walk fails with KeyError on (prev, cur).
            for prev, cur in ((t, v), (v, t)):
                if (prev, cur) not in alias_edges and G.has_edge(prev, cur):
                    alias_edges[(prev, cur)] = self.get_alias_edge(prev, cur)

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges

        return


class BiasedWalker:
    """Random walker over a layered graph structure stored on disk.

    The per-layer adjacency, alias/accept tables and ``gamma`` values are
    read from pickle files under ``temp_path``.
    """

    def __init__(self, idx2node, temp_path):
        """
        :param idx2node: list mapping node index to node id.
        :param temp_path: path prefix of the ``*.pkl`` files read by
            ``simulate_walks``.
        """
        self.idx2node = idx2node
        self.idx = list(range(len(self.idx2node)))
        self.temp_path = temp_path

    def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):
        """Run ``num_walks`` walks from every node, spread over ``workers`` jobs.

        :return: flat list of walks (each a list of node ids).
        """
        # NOTE: pickle files are only safe to load from a trusted location;
        # `temp_path` must point at files produced by this pipeline.
        layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl')
        layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl')
        layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl')
        gamma = pd.read_pickle(self.temp_path+'gamma.pkl')

        nodes = self.idx  # list(self.g.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))
        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma):
        """Worker body: ``num_walks`` passes over ``nodes`` (shuffled in place)."""
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                walks.append(self._exec_random_walk(layers_adj, layers_accept, layers_alias,
                                                    v, walk_length, gamma, stay_prob))
        return walks

    def _exec_random_walk(self, graphs, layers_accept, layers_alias, v, walk_length, gamma, stay_prob=0.3):
        """Walk from node index ``v`` until the path holds ``walk_length`` nodes.

        With probability ``stay_prob`` the walk moves to a neighbour in the
        current layer; otherwise it tries to change layer (which does not
        extend the path).

        :raises ValueError: if the move-up probability cannot be computed
            for the current ``(layer, v)``; the original error is chained.
        """
        initialLayer = 0
        layer = initialLayer

        path = []
        path.append(self.idx2node[v])

        while len(path) < walk_length:
            r = random.random()
            if(r < stay_prob):  # same layer
                v = chooseNeighbor(v, graphs, layers_alias,
                                   layers_accept, layer)
                path.append(self.idx2node[v])
            else:  # different layer
                r = random.random()
                try:
                    x = math.log(gamma[layer][v] + math.e)
                    p_moveup = (x / (x + 1))
                except Exception as err:
                    # `Exception` (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate untouched.
                    print(layer, v)
                    raise ValueError(
                        'cannot compute move-up probability for '
                        'layer=%r, node=%r' % (layer, v)) from err

                if(r > p_moveup):
                    # Move down, but never below the initial layer.
                    if(layer > initialLayer):
                        layer = layer - 1
                else:
                    # Move up only if the node exists in the next layer.
                    if((layer + 1) in graphs and v in graphs[layer + 1]):
                        layer = layer + 1

        return path


def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer):
    """Pick a neighbour of ``v`` in ``layer`` via alias sampling.

    The position drawn from the accept/alias tables of ``(layer, v)`` is
    looked up in ``graphs[layer][v]``.
    """
    neighbours = graphs[layer][v]
    position = alias_sample(layers_accept[layer][v], layers_alias[layer][v])
    return neighbours[position]
