## Dataset

In [1]:
import torch
import dgl
import datetime
import pandas as pd

from dgl.data.utils import makedirs, save_info, load_info
from dgl.sampling import node2vec_random_walk
from sklearn.model_selection import train_test_split
from collections import Counter

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:

# Load graph and prepare
dataset, _ = dgl.load_graphs('../dataset/tfinance')
graph = dataset[0]

if len(graph.ndata['label'].shape) > 1:
    graph.ndata['label'] = graph.ndata['label'].argmax(1)
    graph.ndata['label'] = graph.ndata['label'].long().squeeze(-1)
graph.ndata['feature'] = graph.ndata['feature'].float()

print(f'Graph have {graph.num_nodes()} nodes, {graph.num_edges()} edges')

# SAMPLING
# Randomly get stratified seed nodes
labels = graph.ndata['label']
index = list(range(len(labels)))

idx_sampled, idx_, labels_sampled, labels_ = train_test_split(
    index, labels[index], stratify=labels[index],
    train_size = 0.03, random_state = 777
)

# Random walk from seed
node_trace, edge_trace = node2vec_random_walk(graph, idx_sampled, 1, 1, 10, return_eids=True)
sampled_nodes = list(set(torch.flatten(node_trace).tolist()))
sampled_edges = list(set(torch.flatten(edge_trace).tolist()))

# Get subgraph
graph = dgl.node_subgraph(graph, sampled_nodes)

print(f'Graph have {graph.num_nodes()} nodes, {graph.num_edges()} edges')

Graph have 39357 nodes, 42484443 edges
Graph have 9454 nodes, 15244782 edges


## Model

In [4]:
# Append path
import sys
sys.path.append('../src')

import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

from torch.optim import Adam
from torch.functional import F

import utils.utils_func
import experiment_multiround.supervised_multi
import models.benchmarks_supervised.simple
import models.benchmarks_supervised.spectral
import models.benchmarks_supervised.h2fd
import models.benchmarks_supervised.booster

import importlib
importlib.reload(utils.utils_func)
importlib.reload(experiment_multiround.supervised_multi)
importlib.reload(models.benchmarks_supervised.simple)
importlib.reload(models.benchmarks_supervised.spectral)
importlib.reload(models.benchmarks_supervised.h2fd)

from experiment_multiround.supervised_multi import MultiroundExperiment
from models.benchmarks_supervised.simple import GCN
from models.benchmarks_supervised.spectral import BWGNN
from models.benchmarks_supervised.h2fd import H2FD
from models.benchmarks_supervised.booster import GIN_noparam


In [6]:
model_config = {
    'verbose': 1,
    'model_name': 'XGB',
    'dropout_rate': 0.1,
    'act_name': "ReLU",
    'h_feats': 8,
    'num_layers': 2,
    'mlp_h_feats': 8,
    'mlp_num_layers': 2,
    'att_heads': 2,
    'boost_agg_backbone': GIN_noparam,
    'boost_predictor': xgb.XGBClassifier,
    'boost_metric': average_precision_score,
    'proto_num': 4
}

train_config = {
    'verbose': 3,
    'random_state': 777,
    'train_mode': 'full',
    'batch_size': 128,
    'num_workers': 0,
    'num_epoch': 100,
    'num_round_epoch': 50,
    'initial_ratio': 0.20,
    'train_ratio': 0.90,
    'optimizer': Adam,
    'learning_rate': 0.01,
    'loss': F.cross_entropy,
    'round_num': 1,
    'round_new_pos': 20,
    'round_new_neg': 400,
    'round_budget_pos': 10,
    'round_budget_neg': 200,
    'round_full_training': True,
    'round_reset_model': False
  }

adver_config = {
    'verbose': 1,
    'adver_name': 'REPLAY',
    'feat_coef': 3,
    'conn_coef': 0.3,
    'greedy_seed': True
}

# XGB with and without backbone

In [7]:
import re

ts = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

outer_dfs = []

print(adver_config)

LIST_DSET = ['weibo', 'yelp', 'tsocial']
LIST_MODEL = ['XGB']
LIST_GREEDY = [True]
LIST_FCOEF = [0]
LIST_CCOEF = [0]
# LIST_OTHERS = [1, 3, 5, 9]
LIST_OTHERS = [None, GIN_noparam]
TRIAL_NUM = 5

for dset in LIST_DSET:

  ### DATASET STUFF
  dataset, _ = dgl.load_graphs(f'../dataset/{dset}')
  graph = dataset[0].long()

  if len(graph.ndata['label'].shape) > 1:
      graph.ndata['label'] = graph.ndata['label'].argmax(1)
      graph.ndata['label'] = graph.ndata['label'].long().squeeze(-1)
  graph.ndata['feature'] = graph.ndata['feature'].float()
  train_config['dset_name'] = dset

  ### ADJUST BUDGETS
  pos = (graph.ndata['label'] == 1).sum().item()
  neg = (graph.ndata['label'] == 0).sum().item()
  tot = pos + neg

  train_config['initial_ratio'] = 0.40
  train_config['train_ratio'] = 0.80
  
  train_config['round_num'] = 20
  train_config['round_new_pos'] = int(0.1 * pos)
  train_config['round_new_neg'] = int(0.1 * neg)
  train_config['round_budget_pos'] = 0
  train_config['round_budget_neg'] = 0

  for model in LIST_MODEL:
    model_config['model_name'] = model
    for greedy in LIST_GREEDY:
      adver_config['greedy_seed'] = greedy    
      for fcoef in LIST_FCOEF:
        adver_config['feat_coef'] = fcoef
        for ccoef in LIST_CCOEF:
          adver_config['conn_coef'] = ccoef
          for other_val in LIST_OTHERS:
            model_config['boost_agg_backbone'] = other_val
            
            print("++++++++")
            print("STARTING")
            print(model, greedy, fcoef, ccoef, str(other_val))
            print("++++++++")

            dfs = []
            for i in range(TRIAL_NUM):
              exp = MultiroundExperiment(model_config, adver_config, train_config, graph)
              for j in range(train_config['round_num']):
                exp.adver_round(j)

              log_df = pd.DataFrame(sum([r['log_eval'] for r in exp.rounds], []), columns=['round', 'eval_type', 'time', 'rec', 'prec', 'f1', 'auc', 'tn', 'fp', 'fn', 'tp'])
              log_df['trial'] = i
              
              dfs.append(log_df)

            final_df = pd.concat(dfs)

            for key, value in model_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in train_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in adver_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            final_df['timestamp'] = ts
            stripped = re.sub(r"\W+", "", str(other_val))[:5]
            final_df.to_csv(f'../result/{ts}-INNER-{adver_config["adver_name"]}-{dset}-{model}-{greedy}-{fcoef}-{ccoef}-{stripped}.csv')
            outer_dfs.append(final_df)

final_outer_df = pd.concat(outer_dfs)
final_outer_df.to_csv(f'../result/{ts}.csv')

{'verbose': 1, 'adver_name': 'REPLAY', 'feat_coef': 3, 'conn_coef': 0.3, 'greedy_seed': True}
++++++++
STARTING
XGB True 0 0 None
++++++++

STARTING ROUND 0!
Alotting train-val-test split for round 0
Updated cross-entropy weight to 20.915313225058004
Training set: 18891 ({0: 18029, 1: 862}) rows | Validation set:  4723 ({0: 4508, 1: 215}) rows

Starting training...
[0]	validation_0-logloss:0.50614	validation_0-average_precision_score:0.73732
[1]	validation_0-logloss:0.39817	validation_0-average_precision_score:0.76268
[2]	validation_0-logloss:0.32851	validation_0-average_precision_score:0.77488
[3]	validation_0-logloss:0.27959	validation_0-average_precision_score:0.78181
[4]	validation_0-logloss:0.24781	validation_0-average_precision_score:0.78637
[5]	validation_0-logloss:0.22519	validation_0-average_precision_score:0.78827
[6]	validation_0-logloss:0.20830	validation_0-average_precision_score:0.78701
[7]	validation_0-logloss:0.19647	validation_0-average_precision_score:0.78539
[8]	vali

# Proto

In [9]:
import re

ts = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

outer_dfs = []

print(adver_config)

LIST_DSET = ['tfinance']
LIST_MODEL = ['PROP-PROTO']
LIST_GREEDY = [True]
LIST_FCOEF = [0]
LIST_CCOEF = [0]
LIST_OTHERS = [16, 20]
#LIST_OTHERS = [GIN_noparam]
TRIAL_NUM = 5

for dset in LIST_DSET:

  ### DATASET STUFF
  dataset, _ = dgl.load_graphs(f'../dataset/{dset}')
  graph = dataset[0].long()

  if len(graph.ndata['label'].shape) > 1:
      graph.ndata['label'] = graph.ndata['label'].argmax(1)
      graph.ndata['label'] = graph.ndata['label'].long().squeeze(-1)
  graph.ndata['feature'] = graph.ndata['feature'].float()
  train_config['dset_name'] = dset

  ### ADJUST BUDGETS
  pos = (graph.ndata['label'] == 1).sum().item()
  neg = (graph.ndata['label'] == 0).sum().item()
  tot = pos + neg

  train_config['initial_ratio'] = 0.50
  train_config['train_ratio'] = 0.80
  
  train_config['round_num'] = 10
  train_config['round_new_pos'] = int(0.1 * pos)
  train_config['round_new_neg'] = int(0.1 * neg)
  train_config['round_budget_pos'] = int(0.05 * pos)
  train_config['round_budget_neg'] = int(0.05 * neg)

  for model in LIST_MODEL:
    model_config['model_name'] = model
    for greedy in LIST_GREEDY:
      adver_config['greedy_seed'] = greedy    
      for fcoef in LIST_FCOEF:
        adver_config['feat_coef'] = fcoef
        for ccoef in LIST_CCOEF:
          adver_config['conn_coef'] = ccoef
          for other_val in LIST_OTHERS:
            model_config['proto_num'] = other_val
            
            print("++++++++")
            print("STARTING")
            print(model, greedy, fcoef, ccoef, str(other_val))
            print("++++++++")

            dfs = []
            for i in range(TRIAL_NUM):
              exp = MultiroundExperiment(model_config, adver_config, train_config, graph)
              for j in range(train_config['round_num']):
                exp.adver_round(j)

              log_df = pd.DataFrame(sum([r['log_eval'] for r in exp.rounds], []), columns=['round', 'eval_type', 'time', 'rec', 'prec', 'f1', 'auc', 'tn', 'fp', 'fn', 'tp'])
              log_df['trial'] = i
              
              dfs.append(log_df)

            final_df = pd.concat(dfs)

            for key, value in model_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in train_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in adver_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            final_df['timestamp'] = ts
            stripped = re.sub(r"\W+", "", str(other_val))[:5]
            final_df.to_csv(f'../result/{ts}-INNER-{adver_config["adver_name"]}-{dset}-{model}-{greedy}-{fcoef}-{ccoef}-{stripped}.csv')
            outer_dfs.append(final_df)

final_outer_df = pd.concat(outer_dfs)
final_outer_df.to_csv(f'../result/{ts}.csv')

{'verbose': 1, 'adver_name': 'REPLAY', 'feat_coef': 0, 'conn_coef': 0, 'greedy_seed': True, 'train_mode': 'full'}
++++++++
STARTING
PROP-PROTO True 0 0 16
++++++++

STARTING ROUND 0!
Alotting train-val-test split for round 0
Updated cross-entropy weight to 21.079943899018232
Training set: 15743 ({0: 15030, 1: 713}) rows | Validation set:  3936 ({0: 3758, 1: 178}) rows

Starting training...
Epoch 0, train loss: 0.6332, val loss: 0.6380 val mf1: 0.5785, (best 0.5785)
Epoch 1, train loss: 0.6336, val loss: 0.6388 val mf1: 0.5578, (best 0.5785)
Epoch 2, train loss: 0.6300, val loss: 0.6355 val mf1: 0.5708, (best 0.5785)
Epoch 3, train loss: 0.6302, val loss: 0.6353 val mf1: 0.5814, (best 0.5814)
Epoch 4, train loss: 0.6277, val loss: 0.6351 val mf1: 0.5980, (best 0.5980)
Epoch 5, train loss: 0.6254, val loss: 0.6320 val mf1: 0.6214, (best 0.6214)
Epoch 6, train loss: 0.6258, val loss: 0.6350 val mf1: 0.6074, (best 0.6214)
Epoch 7, train loss: 0.6237, val loss: 0.6307 val mf1: 0.6182, (best

# Proto vs others

In [8]:
import re

ts = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

outer_dfs = []

print(adver_config)

LIST_DSET = ['tfinance']
LIST_MODEL = ['PROP-PROTO', 'GCN', 'XGB', 'BWGNN']
LIST_GREEDY = [True]
LIST_FCOEF = [0]
LIST_CCOEF = [0]
LIST_OTHERS = [4]
#LIST_OTHERS = [GIN_noparam]
TRIAL_NUM = 5

for dset in LIST_DSET:

  ### DATASET STUFF
  dataset, _ = dgl.load_graphs(f'../dataset/{dset}')
  graph = dataset[0].long()

  if len(graph.ndata['label'].shape) > 1:
      graph.ndata['label'] = graph.ndata['label'].argmax(1)
      graph.ndata['label'] = graph.ndata['label'].long().squeeze(-1)
  graph.ndata['feature'] = graph.ndata['feature'].float()
  train_config['dset_name'] = dset

  ### ADJUST BUDGETS
  pos = (graph.ndata['label'] == 1).sum().item()
  neg = (graph.ndata['label'] == 0).sum().item()
  tot = pos + neg

  train_config['initial_ratio'] = 0.50
  train_config['train_ratio'] = 0.80
  
  train_config['round_num'] = 5
  train_config['round_new_pos'] = int(0.1 * pos)
  train_config['round_new_neg'] = int(0.1 * neg)
  train_config['round_budget_pos'] = int(0.05 * pos)
  train_config['round_budget_neg'] = int(0.05 * neg)

  for model in LIST_MODEL:
    model_config['model_name'] = model
    for greedy in LIST_GREEDY:
      adver_config['greedy_seed'] = greedy    
      for fcoef in LIST_FCOEF:
        adver_config['feat_coef'] = fcoef
        for ccoef in LIST_CCOEF:
          adver_config['conn_coef'] = ccoef
          for other_val in LIST_OTHERS:
            model_config['proto_num'] = other_val
            
            print("++++++++")
            print("STARTING")
            print(model, greedy, fcoef, ccoef, str(other_val))
            print("++++++++")

            dfs = []
            for i in range(TRIAL_NUM):
              exp = MultiroundExperiment(model_config, adver_config, train_config, graph)
              for j in range(train_config['round_num']):
                exp.adver_round(j)

              log_df = pd.DataFrame(sum([r['log_eval'] for r in exp.rounds], []), columns=['round', 'eval_type', 'time', 'rec', 'prec', 'f1', 'auc', 'tn', 'fp', 'fn', 'tp'])
              log_df['trial'] = i
              
              dfs.append(log_df)

            final_df = pd.concat(dfs)

            for key, value in model_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in train_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in adver_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            final_df['timestamp'] = ts
            stripped = re.sub(r"\W+", "", str(other_val))[:5]
            final_df.to_csv(f'../result/{ts}-INNER-{adver_config["adver_name"]}-{dset}-{model}-{greedy}-{fcoef}-{ccoef}-{stripped}.csv')
            outer_dfs.append(final_df)

final_outer_df = pd.concat(outer_dfs)
final_outer_df.to_csv(f'../result/{ts}.csv')

{'verbose': 1, 'adver_name': 'REPLAY', 'feat_coef': 3, 'conn_coef': 0.3, 'greedy_seed': True}
++++++++
STARTING
PROP-PROTO True 0 0 4
++++++++

STARTING ROUND 0!
Alotting train-val-test split for round 0
Updated cross-entropy weight to 20.44822888283379
Training set: 15743 ({0: 15009, 1: 734}) rows | Validation set:  3936 ({0: 3752, 1: 184}) rows

Starting training...
Epoch 0, train loss: 0.6436, val loss: 0.6405 val mf1: 0.5433, (best 0.5433)
Epoch 1, train loss: 0.6422, val loss: 0.6397 val mf1: 0.5363, (best 0.5433)
Epoch 2, train loss: 0.6417, val loss: 0.6392 val mf1: 0.5280, (best 0.5433)
Epoch 3, train loss: 0.6409, val loss: 0.6389 val mf1: 0.5363, (best 0.5433)
Epoch 4, train loss: 0.6410, val loss: 0.6396 val mf1: 0.5425, (best 0.5433)
Epoch 5, train loss: 0.6404, val loss: 0.6391 val mf1: 0.5425, (best 0.5433)
Epoch 6, train loss: 0.6398, val loss: 0.6392 val mf1: 0.5425, (best 0.5433)
Epoch 7, train loss: 0.6401, val loss: 0.6393 val mf1: 0.5389, (best 0.5433)
Epoch 8, trai

# SPLIT PROTO

In [6]:
import re

ts = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

outer_dfs = []

print(adver_config)

LIST_DSET = ['tfinance']
LIST_MODEL = ['PROP-SPLITPROTO']
LIST_GREEDY = [True]
LIST_FCOEF = [0]
LIST_CCOEF = [0]
LIST_OTHERS = [1, 4, 10]
#LIST_OTHERS = [GIN_noparam]
TRIAL_NUM = 5

for dset in LIST_DSET:

  ### DATASET STUFF
  dataset, _ = dgl.load_graphs(f'../dataset/{dset}')
  graph = dataset[0].long()

  if len(graph.ndata['label'].shape) > 1:
      graph.ndata['label'] = graph.ndata['label'].argmax(1)
      graph.ndata['label'] = graph.ndata['label'].long().squeeze(-1)
  graph.ndata['feature'] = graph.ndata['feature'].float()
  train_config['dset_name'] = dset

  ### ADJUST BUDGETS
  pos = (graph.ndata['label'] == 1).sum().item()
  neg = (graph.ndata['label'] == 0).sum().item()
  tot = pos + neg

  train_config['initial_ratio'] = 0.50
  train_config['train_ratio'] = 0.80
  
  train_config['round_num'] = 1
  train_config['round_new_pos'] = int(0.1 * pos)
  train_config['round_new_neg'] = int(0.1 * neg)
  train_config['round_budget_pos'] = int(0.05 * pos)
  train_config['round_budget_neg'] = int(0.05 * neg)

  for model in LIST_MODEL:
    model_config['model_name'] = model
    for greedy in LIST_GREEDY:
      adver_config['greedy_seed'] = greedy    
      for fcoef in LIST_FCOEF:
        adver_config['feat_coef'] = fcoef
        for ccoef in LIST_CCOEF:
          adver_config['conn_coef'] = ccoef
          for other_val in LIST_OTHERS:
            model_config['proto_num'] = other_val
            
            print("++++++++")
            print("STARTING")
            print(model, greedy, fcoef, ccoef, str(other_val))
            print("++++++++")

            dfs = []
            for i in range(TRIAL_NUM):
              exp = MultiroundExperiment(model_config, adver_config, train_config, graph)
              for j in range(train_config['round_num']):
                exp.adver_round(j)

              log_df = pd.DataFrame(sum([r['log_eval'] for r in exp.rounds], []), columns=['round', 'eval_type', 'time', 'rec', 'prec', 'f1', 'auc', 'tn', 'fp', 'fn', 'tp'])
              log_df['trial'] = i
              
              dfs.append(log_df)

            final_df = pd.concat(dfs)

            for key, value in model_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in train_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            for key, value in adver_config.items():
              if key not in ['verbose']:
                final_df[key] = str(value)

            final_df['timestamp'] = ts
            stripped = re.sub(r"\W+", "", str(other_val))[:5]
            final_df.to_csv(f'../result/{ts}-INNER-{adver_config["adver_name"]}-{dset}-{model}-{greedy}-{fcoef}-{ccoef}-{stripped}.csv')
            outer_dfs.append(final_df)

final_outer_df = pd.concat(outer_dfs)
final_outer_df.to_csv(f'../result/{ts}.csv')

{'verbose': 1, 'adver_name': 'REPLAY', 'feat_coef': 3, 'conn_coef': 0.3, 'greedy_seed': True}
++++++++
STARTING
PROP-SPLITPROTO True 0 0 1
++++++++

STARTING ROUND 0!
Alotting train-val-test split for round 0
Updated cross-entropy weight to 21.33049645390071
Training set: 15743 ({0: 15038, 1: 705}) rows | Validation set:  3936 ({0: 3760, 1: 176}) rows

Starting training...
Epoch 0, train loss: 0.7187, val loss: 0.7174 val mf1: 0.5059, (best 0.5059)
Epoch 1, train loss: 0.7168, val loss: 0.7158 val mf1: 0.5102, (best 0.5102)
Epoch 2, train loss: 0.7149, val loss: 0.7140 val mf1: 0.5180, (best 0.5180)
Epoch 3, train loss: 0.7134, val loss: 0.7126 val mf1: 0.5178, (best 0.5180)
Epoch 4, train loss: 0.7119, val loss: 0.7112 val mf1: 0.5212, (best 0.5212)
Epoch 5, train loss: 0.7108, val loss: 0.7100 val mf1: 0.5248, (best 0.5248)
Epoch 6, train loss: 0.7087, val loss: 0.7080 val mf1: 0.5327, (best 0.5327)
Epoch 7, train loss: 0.7077, val loss: 0.7072 val mf1: 0.5364, (best 0.5364)
Epoch 8,