In [1]:
### run the following script in the terminal
# python -m pip config set global.index-url https://PLV3106:${ARTIFACTORY_KEY}@repo.usaa.com/api/pypi/usaa-pypi-xray/simple
### install the following package
#!pip install --upgrade pip
#!pip install --quiet tqdm==4.33.0 
# !pip install --quiet torch==1.6.0 --index-url https://repo.usaa.com/artifactory/api/pypi/usaa-pypi-eval/simple --trusted-host repo.usaa.com
# !pip install --quiet category-encoders==2.2.2
#!pip install /mnt/dgl_cu102-0.6.0-cp36-cp36m-manylinux1_x86_64.whl
#!pip install catboost==0.25.1
# !pip install --quiet scikit-learn==0.24.2


In [2]:
import argparse
from tqdm import tqdm
tqdm().pandas()
import pandas as pd
import numpy as np
import json
import os
import time
import pickle

# os.environ['OMP_NUM_THREADS']=1
# export OMP_NUM_THREADS=1

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Dropout, ELU, Sequential, Linear, ReLU

import dgl
from dgl import edge_subgraph
import dgl.nn as dglnn
import dgl.function as fn

import bgnn_cpu
import bgnn_gpu
import bgnn_gpu_LGB
import models
import utils

from category_encoders import CatBoostEncoder
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc as auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder, label_binarize

0it [00:00, ?it/s]
Using backend: pytorch


In [3]:
# import sklearn
# print(sklearn.__version__)

In [4]:
def normalize_features(X, train_mask, val_mask, test_mask):
    min_max_scaler = preprocessing.MinMaxScaler()
    A = X.to_numpy(copy=True)
    A[train_mask] = min_max_scaler.fit_transform(A[train_mask])
    A[val_mask + test_mask] = min_max_scaler.transform(A[val_mask + test_mask])
    return pd.DataFrame(A, columns=X.columns).astype(float)

def replace_na(X, train_mask):
    if X.isna().any().any():
        return X.fillna(X.iloc[train_mask].min() - 1)
    return X

def encode_cat_features(X, y, cat_features, train_mask, val_mask, test_mask):
    enc = CatBoostEncoder()
    A = X.to_numpy(copy=True)
    b = y.to_numpy(copy=True)
    A[np.ix_(train_mask, cat_features)] = enc.fit_transform(A[np.ix_(train_mask, cat_features)], b[train_mask])
    A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform(A[np.ix_(val_mask + test_mask, cat_features)])
    A = A.astype(float)
    return pd.DataFrame(A, columns=X.columns)


def lift_gain_eval(logit,label,topk):
    DF=pd.DataFrame(columns=["pred_score","actual_label"])
    DF["pred_score"]=logit
    DF["actual_label"]=label
    DF.sort_values(by="pred_score", ascending=False, inplace=True)
    gain={}
    for p in topk:
        N=math.ceil(int(DF.shape[0]*p))
        DF2=DF.nlargest(N,"pred_score",keep="first")
        gain[str(int(p*100))+"%"]=round(DF2.actual_label.sum()/(DF.actual_label.sum()),2)
    return gain


def get_class_count_weight(y,n_classes):
    classes_count=[]
    weight=[]
    for i in range(n_classes):
        count=np.sum(y.squeeze()==i)
        classes_count.append(count)
        weight.append(len(y)/(n_classes*count))
    return classes_count,weight


def eval_loop_func(model, loader, labels, device, loss_weight, num_classes):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    losses=[]
    for input_nodes_raw, seeds, blocks in tqdm(loader, position=0, leave=True):
        blocks = [blk.to(device) for blk in blocks]
        seeds = seeds.to(device)
        
        input_nodes={}
        input_nodes["usaanr"]=input_nodes_raw
        input_nodes={k : e.to(device) for k, e in input_nodes.items()}

        lbl = labels[seeds].squeeze().to(device)
        
        with th.no_grad():
            logits,h = model(input_nodes,blocks)
            if loss_weight is None:
                loss = F.cross_entropy(logits.view(-1, num_classes), lbl.to(device))
            else:
                loss = F.cross_entropy(logits.view(-1, num_classes), lbl.to(device),weight=loss_weight.float())        
            losses.append(loss.item())
        fin_targets.append(lbl.cpu().detach().numpy())
        fin_outputs.append(logits.cpu().detach().numpy())
    return np.concatenate(fin_outputs), np.concatenate(fin_targets), losses


def evaluate(target, predicted):
    true_label_mask=[1 if (np.argmax(x)-target[i])==0 else 0 for i,x in enumerate(predicted)]
    nb_prediction=len(true_label_mask)
    true_prediction=sum(true_label_mask)
    false_prediction=nb_prediction-true_prediction
    accuracy=true_prediction/nb_prediction
    
    precision, recall, fscore, support = precision_recall_fscore_support(target, predicted.argmax(axis=1))
    auc = roc_auc_score(target.ravel(), th.sigmoid(th.from_numpy(predicted))[:,1].numpy().ravel())
    
    prec,rec,_ = precision_recall_curve(target.ravel(), th.sigmoid(th.from_numpy(predicted))[:,1].numpy().ravel())
    
    pr_auc=auc_score(rec,prec)
    
    arg1=predicted[:,1]
    arg2=target
    gain = lift_gain_eval(arg1,arg2,topk=[0.01,0.05,0.10])
    
    return {
        "nb_example":len(target),
        "true_prediction":true_prediction,
        "false_prediction":false_prediction,
        "accuracy":accuracy,
        "precision":precision[1], 
        "recall":recall[1], 
        "f1_score":fscore[1],
        "AUC":auc,
        "pr_auc":pr_auc,
        "GAIN":gain
    }


In [5]:
KG_dir="/workspace/cjiang/eagle_project/CAP_graph/BGNN/"

start=time.time()
with open(os.path.join(KG_dir,'homo_graph'), 'rb') as f:
    G, binary_label, train_mask,val_mask, test_mask = pickle.load(f)
end=time.time()
print("It took {:0.4f} seconds to load graph".format(end-start))

It took 16.0569 seconds to load graph


#### sampling data

In [6]:
G.ndata["binary_label"]=binary_label
G.ndata["train_mask"]=train_mask
G.ndata["val_mask"]=val_mask
G.ndata["test_mask"]=test_mask

dict_edges={}
for etype in G.etypes:
    dict_edges[etype]=torch.arange(G.num_edges(etype))[0:5000]
g=dgl.edge_subgraph(G,dict_edges)

# g=G

binary_label=g.ndata.pop("binary_label")
train_mask=g.ndata.pop("train_mask")
val_mask=g.ndata.pop("val_mask")
test_mask=g.ndata.pop("test_mask")

utils.graph_show(g)

**************************************************
Node_types:  ['_N']
Edge_types:  ['_E']
**************************************************
Canonical Etypes of Graph is:

_N                  _E                  _N                  
**************************************************
number of ntype=_N                    8,359          
**************************************************
Total number of nodes is 8,359
**************************************************
number of etype=_E                    5,000          
**************************************************
Total number of edges is 5,000
**************************************************
**************************************************
The attributes for the node type=_N
**************************************************
usaayr                                  torch.Size([8359, 1])
AGE_BAND                                torch.Size([8359, 1])
ORIGEL                                  torch.Size([8359, 1])
ELIG2            

#### parameters setting

In [7]:
parser = argparse.ArgumentParser(description='BGNN')

parser.add_argument("--task", type=str, default="classification")
parser.add_argument("--h_dim", type=int, default=128,help="number of hidden units")
parser.add_argument("--trees_per_epoch", type=int, default=5)
parser.add_argument("--gbdt_depth", type=int, default=6)

parser.add_argument("--backprop_per_epoch", type=int, default=1)
parser.add_argument("--lr", type=float, default=0.01,help="learning rate for GNN")
parser.add_argument("--gbdt_lr", type=float, default=0.01,help="learning rate for GBDT")
parser.add_argument("--append_gbdt_pred", default=True, action='store_true')
parser.add_argument("--train_input_features", default=True, action='store_true')

parser.add_argument("--dropout", type=float, default=0,
        help="dropout probability")
parser.add_argument("--out_dim", type=int, default=1,
        help="output dimension")
parser.add_argument("--num_bases", type=int, default=-1,
        help="number of filter weight matrices, default: -1 [use all]")
parser.add_argument("--num_layers", type=int, default=1,
        help="number of propagation rounds")
parser.add_argument("-e", "--n_epochs", type=int, default=1,
        help="number of training epochs")
parser.add_argument("--l2norm", type=float, default=0,
        help="l2 norm coef")
parser.add_argument("--gpu", type=int, default=0,help="gpu")
parser.add_argument("--use_self_loop", default=True, action='store_true',
        help="include self feature as a special relation")
parser.add_argument("--batch_size", type=int, default=1024,
        help="Mini-batch size. If -1, use full graph training.")
parser.add_argument("--num_mini_batch", type=int, default=8,
        help="Number of minibatch.")
parser.add_argument("--fanout", type=int, default=None,
        help="Fan-out of neighbor sampling.")
parser.add_argument("--validation",  default=True,
        help="set up validation .")
parser.add_argument("--seed",  type=int,default=101,
        help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")
parser.add_argument("--loss_weight",  type=bool,default=True,  ## number of label=0/number of label=1
        help="weight for unbalance data")

args,unknown=parser.parse_known_args()

args.num_layers=1
args.dropout=0.2
args.lr=1e-3
args.l2norm=1e-3
args.num_bases=5
args.h_dim=64
args.low_mem=True
args.layer_norm=True
args.use_self_loop=True
args.batch_size=1024*10
args.backprop_per_epoch=5
print(args)

Namespace(append_gbdt_pred=True, backprop_per_epoch=5, batch_size=10240, dropout=0.2, fanout=None, gbdt_depth=6, gbdt_lr=0.01, gpu=0, h_dim=64, l2norm=0.001, layer_norm=True, loss_weight=True, low_mem=True, lr=0.001, n_epochs=1, num_bases=5, num_layers=1, num_mini_batch=8, out_dim=1, seed=101, task='classification', train_input_features=True, trees_per_epoch=5, use_self_loop=True, validation=True)


#### Create data loader

In [8]:
train_idx=torch.nonzero(train_mask.squeeze()).squeeze().tolist()
val_idx=torch.nonzero(val_mask.squeeze()).squeeze().tolist()
test_idx=torch.nonzero(test_mask.squeeze()).squeeze().tolist()

print('{:<15} {:<10,}'.format("Training set",len(train_idx)))
print('{:<15} {:<10,}'.format("validation set",len(val_idx)))
print('{:<15} {:<10,}'.format("test set",len(test_idx)))

Training set    6,737     
validation set  804       
test set        818       


	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  """Entry point for launching an IPython kernel.


In [9]:
feat_list=[]
for key, scheme in g.node_attr_schemes().items():
    feat_list.append(key)

In [10]:
X={}
for i in tqdm(range(len(feat_list)),position=0, leave=True):
    col=feat_list[i]
    X[col]=g.ndata.pop(col).squeeze().tolist()
    
X=pd.DataFrame(X)    

X.head(2)

100%|██████████| 19/19 [00:00<00:00, 3956.30it/s]


Unnamed: 0,usaayr,AGE_BAND,ORIGEL,ELIG2,cmpyelig,SEX,MARST,PERSST,DEATHSDT,BRANCH,MILST,MLIST_OrigStat,enl1stsdt,COMMSDT,ENLPAYGD,ACTCORP,STATE,Segment,_ID
0,29,5,0,0,1,1,2,0,326,1,2,2,28712,24917,14,1,32,1,0
1,55,4,26,3,0,2,5,0,326,0,0,2,28712,24917,24,1,9,2,2


In [11]:
y={}
y['class']=binary_label.squeeze().tolist()
y=pd.DataFrame(y)
y['class'].head(2)

0    1
1    1
Name: class, dtype: int64

In [12]:
for col in X.columns:
    X[col]=X[col].astype(str)

In [13]:
encoded_X = X.copy()
normalizeFeatures = False
replaceNa = True

cat_features=np.arange(len(feat_list))

if len(cat_features):
    encoded_X = encode_cat_features(encoded_X, y, cat_features, train_idx, val_idx, test_idx)
if normalizeFeatures:
    encoded_X = normalize_features(encoded_X, train_idx, val_idx, test_idx)
if replaceNa:
    encoded_X = replace_na(encoded_X, train_idx)

In [14]:
encoded_X.head(2)

Unnamed: 0,usaayr,AGE_BAND,ORIGEL,ELIG2,cmpyelig,SEX,MARST,PERSST,DEATHSDT,BRANCH,MILST,MLIST_OrigStat,enl1stsdt,COMMSDT,ENLPAYGD,ACTCORP,STATE,Segment,_ID
0,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232
1,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.428232,0.714116,0.714116,0.428232,0.428232,0.714116,0.714116,0.714116,0.428232,0.714116,0.428232,0.428232,0.428232


In [15]:
num_rels=torch.unique(g.edata['etype']).shape[0]

out_dim=torch.unique(binary_label).shape[0]
in_dim = out_dim + X.shape[1]

dummy_model = models.EntityClassify(g,
                       in_dim,
                       out_dim,
                       num_rels,
                       args.num_bases,
                       args.num_layers,
                       args.dropout,
                       args.use_self_loop,
                       args.low_mem,
                       args.layer_norm)

# dummy_model.to(device)

In [16]:
print("The total # of parameter is {:,}".format(sum([p.nelement() for p in dummy_model.parameters()]) ) )

The total # of parameter is 1,934


In [17]:
param_dict={n: p.nelement() for n, p in dummy_model.named_parameters()}
for i,j in param_dict.items():
    print("{:<70}{:<15,}".format(i,j))

layers.0.weight                                                       441            
layers.0.h_bias                                                       21             
layers.0.loop_weight                                                  441            
layers.0.layer_norm_weight.weight                                     21             
layers.0.layer_norm_weight.bias                                       21             
layers.1.weight                                                       441            
layers.1.h_bias                                                       21             
layers.1.loop_weight                                                  441            
layers.1.layer_norm_weight.weight                                     21             
layers.1.layer_norm_weight.bias                                       21             
classifier.weight                                                     42             
classifier.bias                                       

#### CPU

In [18]:
##### check cuda
device="cpu"
# use_cuda=args.gpu>=0 and torch.cuda.is_available()
# if use_cuda:
#     torch.cuda.set_device(args.gpu)
#     device='cuda:%d' % args.gpu
print(device)
dummy_model.to(device)

cpu


EntityClassify(
  (layers): ModuleList(
    (0): RelGraphConv(
      (layer_norm_weight): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1): RelGraphConv(
      (layer_norm_weight): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (classifier): Linear(in_features=21, out_features=2, bias=True)
)

In [19]:
# initialize BGNN model
bgnn = bgnn_cpu.BGNNPredictor(dummy_model, 
                              device,
                              task='classification',
                              loss_fn=None,
                              trees_per_epoch=args.trees_per_epoch,
                              backprop_per_epoch=args.backprop_per_epoch,
                              lr=args.lr,
                              append_gbdt_pred=args.append_gbdt_pred,
                              train_input_features=args.train_input_features,
                              gbdt_depth=args.gbdt_depth,
                              gbdt_lr=args.gbdt_lr)

In [20]:
# %pdb
start=time.time()
train_metric, val_metric, test_metric = bgnn.fit(g, encoded_X, y, train_idx, val_idx, test_idx,\
                   original_X = X, cat_features=cat_features,\
                   num_epochs=10, patience=5, metric_name='loss',\
                   fanout=args.fanout,num_layers=args.num_layers,batch_size=args.batch_size)

end=time.time()
print("loading time is {:0.4f}".format(end-start))

100%|██████████| 1/1 [00:00<00:00, 16.90it/s]
100%|██████████| 1/1 [00:00<00:00,  6.69it/s]
100%|██████████| 1/1 [00:00<00:00, 44.89it/s]
100%|██████████| 1/1 [00:00<00:00, 27.02it/s]
100%|██████████| 1/1 [00:00<00:00,  8.41it/s]
100%|██████████| 1/1 [00:00<00:00, 112.18it/s]
100%|██████████| 1/1 [00:00<00:00,  7.11it/s]
100%|██████████| 1/1 [00:00<00:00, 15.22it/s]
100%|██████████| 1/1 [00:00<00:00, 15.21it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
100%|██████████| 1/1 [00:00<00:00, 28.05it/s]
100%|██████████| 1/1 [00:00<00:00, 33.74it/s]
100%|██████████| 1/1 [00:00<00:00, 24.34it/s]
100%|██████████| 1/1 [00:00<00:00, 26.94it/s]
100%|██████████| 1/1 [00:00<00:00, 124.11it/s]
100%|██████████| 1/1 [00:00<00:00, 10.32it/s]
100%|██████████| 1/1 [00:00<00:00, 11.97it/s]
100%|██████████| 1/1 [00:00<00:00, 65.55it/s]
100%|██████████| 1/1 [00:00<00:00, 24.95it/s]
100%|██████████| 1/1 [00:00<00:00,  9.46it/s]
100%|██████████| 1/1 [00:00<00:00, 61.39it/s]
100%|██████████| 1/1 [00:00<00:0

Best loss at iteration 9: 0.077
loading time is 24.6961





In [21]:
train_metric

{'loss': 0.08580104261636734,
 'nb_example': 6737,
 'true_prediction': 6720,
 'false_prediction': 17,
 'accuracy': 0.9974766216416803,
 'precision': 0.9944827586206897,
 'recall': 0.9970613656006914,
 'AUC': 0.9999643661219003,
 'pr_auc': 0.99995239625854,
 'GAIN': {'1%': 0.02, '5%': 0.12, '10%': 0.23}}

In [22]:
val_metric

{'loss': 0.07661468535661697,
 'nb_example': 6737,
 'true_prediction': 6722,
 'false_prediction': 15,
 'accuracy': 0.9977734896838355,
 'precision': 0.9948275862068966,
 'recall': 0.9974070872947277,
 'AUC': 0.9999871322106862,
 'pr_auc': 0.9999827845413412,
 'GAIN': {'1%': 0.02, '5%': 0.12, '10%': 0.23}}

In [23]:
test_metric

{'loss': 0.061647929251194,
 'nb_example': 818,
 'true_prediction': 818,
 'false_prediction': 0,
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'AUC': 1.0,
 'pr_auc': 1.0,
 'GAIN': {'1%': 0.02, '5%': 0.11, '10%': 0.22}}

In [24]:
# predicted= bgnn.predict(g, X, test_idx,args.fanout,args.num_layers,args.batch_size)

####  GPU

In [25]:
##### check cuda
device="cpu"
use_cuda=args.gpu>=0 and torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(args.gpu)
    device='cuda:%d' % args.gpu
print(device)
dummy_model.to(device)

cuda:0


EntityClassify(
  (layers): ModuleList(
    (0): RelGraphConv(
      (layer_norm_weight): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1): RelGraphConv(
      (layer_norm_weight): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (classifier): Linear(in_features=21, out_features=2, bias=True)
)

In [26]:
# initialize BGNN model
bgnn = bgnn_gpu_LGB.BGNNPredictor(dummy_model, 
                              device,
                              task='classification',
                              loss_fn=None,
                              trees_per_epoch=args.trees_per_epoch,
                              backprop_per_epoch=args.backprop_per_epoch,
                              lr=args.lr,
                              append_gbdt_pred=args.append_gbdt_pred,
                              train_input_features=args.train_input_features,
                              gbdt_depth=args.gbdt_depth,
                              gbdt_lr=args.gbdt_lr)

In [27]:
# %pdb
start=time.time()
train_metric, val_metric, test_metric = bgnn.fit(g, encoded_X, y, train_idx, val_idx, test_idx,\
                   original_X = X, cat_features=cat_features,\
                   num_epochs=10, patience=5, metric_name='loss',\
                   fanout=args.fanout,num_layers=args.num_layers,batch_size=args.batch_size)

end=time.time()
print("loading time is {:0.4f}".format(end-start))

  return f(*args, **kwargs)


TypeError: can only concatenate list (not "set") to list

In [None]:
train_metric

In [None]:
val_metric

In [None]:
test_metric