In [None]:
# !pip install --quiet --pre dgl-cu101
# !pip install --quiet torch==1.6.0

In [1]:
import argparse
import itertools
import os
import numpy as np
from numpy import save,load,savetxt,loadtxt,savez_compressed
from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc as auc_score
import pandas as pd
import scipy.sparse as sp
import time
from tqdm import tqdm, tqdm_notebook,tnrange
tqdm.pandas(position=0, leave=True)
import math 

import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.nn as dglnn
import dgl.function as fn
from dgl.ops import edge_softmax

from functools import partial
import seaborn as sns
import pickle
import random
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize']=(5.0,4.0)
plt.rcParams['image.interpolation']='nearest'
plt.rcParams['image.cmap']='gray'
import warnings
warnings.filterwarnings('ignore')
import utils
import tsne_func
print("torch version is {}".format(th.__version__))
print("DGL version is {}".format(dgl.__version__))

Using backend: pytorch


torch version is 1.6.0
DGL version is 0.7a210520


In [2]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)
    th.cuda.manual_seed(seed)
    th.cuda.manual_seed_all(seed)
    th.backends.cudnn.deterministic = True
    th.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(101)

In [26]:
data_dir="/workspace/cjiang/eagle_project/CAP_graph/CAP_Update/"

start=time.time()
with open(os.path.join(data_dir,"CAP_Graph_New"),"rb") as f:
    G,multi_label,binary_label,\
    train_mask_multi_label,  val_mask_multi_label,  test_mask_multi_label,\
    train_mask_binary_label, val_mask_binary_label, test_mask_binary_label=pickle.load(f)
end=time.time()
print("It took {:0.4f} seconds to load graph".format(end-start))

It took 12.8735 seconds to load graph


In [35]:
start=time.time()
with open(os.path.join(data_dir,"CAP_Graph"),"rb") as f:
    G_orig,multi_label_orig,binary_label_orig,\
    train_mask_multi_label_orig,  val_mask_multi_label_orig,  test_mask_multi_label_orig,\
    train_mask_binary_label_orig, val_mask_binary_label_orig, test_mask_binary_label_orig=pickle.load(f)
end=time.time()
print("It took {:0.4f} seconds to load graph".format(end-start))

It took 10.2244 seconds to load graph


In [48]:
th.unique(binary_label,return_counts=True)

(tensor([0, 1]), tensor([19358913,  6309591]))

In [49]:
th.unique(binary_label_orig,return_counts=True)

(tensor([0, 1]), tensor([19358913,  6309591]))

In [47]:
th.sum(th.eq(binary_label,binary_label_orig)), binary_label.shape[0]

(tensor(15939498), 25668504)

In [44]:
th.sum(th.eq(train_mask_binary_label,train_mask_binary_label_orig)),\
th.sum(th.eq(val_mask_binary_label,val_mask_binary_label_orig)),\
th.sum(th.eq(test_mask_binary_label,test_mask_binary_label_orig))

(tensor(17454180), tensor(21049994), tensor(21048350))

In [45]:
th.sum(train_mask_binary_label_orig),th.sum(val_mask_binary_label_orig),th.sum(test_mask_binary_label_orig)

(tensor(20534804), tensor(2566850), tensor(2566850))

In [39]:
assert train_mask_binary_label.squeeze()==train_mask_binary_label_orig.squeeze()

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [4]:
utils.graph_show(G)

**************************************************
Node_types:  ['usaanr']
Edge_types:  ['AUTO_RELATED', 'Brother_Sister', 'Busi_rel_Other', 'Child', 'Ex-Spouse', 'Parent', 'Pers_rel_Other', 'SPONSEE', 'SPONSOR', 'Spouse', 'Step-Child', 'Step-Parent']
**************************************************
Canonical Etypes of Graph is:

usaanr              AUTO_RELATED        usaanr              
usaanr              Brother_Sister      usaanr              
usaanr              Busi_rel_Other      usaanr              
usaanr              Child               usaanr              
usaanr              Ex-Spouse           usaanr              
usaanr              Parent              usaanr              
usaanr              Pers_rel_Other      usaanr              
usaanr              SPONSEE             usaanr              
usaanr              SPONSOR             usaanr              
usaanr              Spouse              usaanr              
usaanr              Step-Child          usaanr          

In [7]:
usaanr_feat=[]
for key, scheme in G.node_attr_schemes(ntype="usaanr").items():
    usaanr_feat.append(key)
usaanr_feat=[x for x in usaanr_feat if x not in ['ZIPCD','AGE','PERSST','DEATHSDT','enl1stsdt','COMMSDT']]

print()
print("The features associated with USAA Member are\n ")
for i in usaanr_feat:
    print(i)


The features associated with USAA Member are
 
usaayr
AGE_BAND
ORIGEL
ELIG2
cmpyelig
SEX
MARST
BRANCH
MILST
MLIST_OrigStat
ENLPAYGD
ACTCORP
STATE
Segment


In [8]:
## USAA Members Features Embedding
class USAANR_Embedding(nn.Module):
    def __init__(self,G,feature_size):
        super(USAANR_Embedding,self).__init__()
        self.G=G.to(device)
        self.feature_size=feature_size
        ## Embedding matrices for features of nodes.
        self.emb = nn.ModuleDict()
        
        for i,col in enumerate(usaanr_feat):
            self.emb[col]=nn.Embedding(G.nodes['usaanr'].data[col].max().item()+1, feature_size)
    
    def forward(self,nid):
        nid=nid.to(device)
        extra_repr=[]
        for i,col in enumerate(usaanr_feat):
            ndata=self.G.nodes['usaanr'].data[col]
            extra_repr.append(self.emb[col](ndata[nid]).squeeze(1))
        return th.stack(extra_repr, 0).sum(0)


class RelGraphConvLayer(nn.Module):
    r"""Relational graph convolution layer.
    Parameters
    ----------
    in_feat : int
        Input feature size.
    out_feat : int
        Output feature size.
    rel_names : list[str]
        Relation names.
    num_bases : int, optional
        Number of bases. If is none, use number of relations. Default: None.
    weight : bool, optional
        True if a linear layer is applied after message passing. Default: True
    bias : bool, optional
        True if bias is added. Default: True
    activation : callable, optional
        Activation function. Default: None
    self_loop : bool, optional
        True to include self loop message. Default: False
    dropout : float, optional
        Dropout rate. Default: 0.0
    """
    def __init__(self,
                 in_feat,
                 out_feat,
                 rel_names,
                 num_bases,
                 *,
                 weight=True,
                 bias=True,
                 activation=None,
                 self_loop=False,
                 dropout=0.0):
        super(RelGraphConvLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.rel_names = rel_names
        self.num_bases = num_bases
        self.bias = bias
        self.activation = activation
        self.self_loop = self_loop
        self.conv = dglnn.HeteroGraphConv({
                rel : dglnn.GraphConv(in_feat, out_feat, norm="both", weight=False, bias=False)
#                 rel : dglnn.SAGEConv(in_feat, out_feat, aggregator_type='mean',feat_drop=0.,bias=True,norm=None)
                for rel in rel_names
            })
        self.use_weight = weight
        self.use_basis = num_bases < len(self.rel_names) and weight
        if self.use_weight:
            if self.use_basis:
                self.basis = dglnn.WeightBasis((in_feat, out_feat), num_bases, len(self.rel_names))
            else:
                self.weight = nn.Parameter(th.Tensor(len(self.rel_names), in_feat, out_feat))
                nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
        # bias
        if bias:
            self.h_bias = nn.Parameter(th.Tensor(out_feat))
            nn.init.zeros_(self.h_bias)
        # weight for self loop
        if self.self_loop:
            self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
            nn.init.xavier_uniform_(self.loop_weight,
                                    gain=nn.init.calculate_gain('relu'))
        self.dropout = nn.Dropout(dropout)
    def forward(self, g, inputs):
        """Forward computation
        Parameters
        ----------
        g : DGLHeteroGraph
            Input graph.
        inputs : dict[str, torch.Tensor]
            Node feature for each node type.
        Returns
        -------
        dict[str, torch.Tensor]
            New node features for each node type.
        """
        g = g.local_var()
        if self.use_weight:
            weight = self.basis() if self.use_basis else self.weight
            wdict = {self.rel_names[i] : {'weight' : w.squeeze(0)}
                     for i, w in enumerate(th.split(weight, 1, dim=0))}
        else:
            wdict = {}
        if g.is_block:
            inputs_src = inputs
            inputs_dst = {k: v[:g.number_of_dst_nodes(k)] for k, v in inputs.items()}
        else:
            inputs_src = inputs_dst = inputs
        hs = self.conv(g, inputs, mod_kwargs=wdict)
        def _apply(ntype, h):
            if self.self_loop:
                h = h + th.matmul(inputs_dst[ntype], self.loop_weight)
            if self.bias:
                h = h + self.h_bias
            if self.activation:
                h = self.activation(h)
            return self.dropout(h)
        return {ntype : _apply(ntype, h) for ntype, h in hs.items()}
    

class Entity_Classify(nn.Module):
    def __init__(self,
                 g,
                 h_dim,
                 out_dim,
                 num_bases,
#                  embed_layer,
                 num_hidden_layers=1,
                 dropout=0,
                 use_self_loop=False):
        super(Entity_Classify, self).__init__()
        self.g = g
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.rel_names = list(set(g.etypes))
#         self.num_bases = None if num_bases < 0 else num_bases
        if num_bases < 0 or num_bases > len(self.rel_names):
            self.num_bases = len(self.rel_names)
        else:
            self.num_bases = num_bases
            
        self.num_hidden_layers = num_hidden_layers
        self.dropout = dropout
        self.use_self_loop = use_self_loop
        
#         self.node_embed={}
        self.node_embed=nn.ModuleDict()
        self.node_embed['usaanr'] = USAANR_Embedding(self.g,self.h_dim)
#         self.node_embed['zipcode'] = Zipcode_Embedding(self.g,self.h_dim)
        self.layers = nn.ModuleList()
        #i2h
        self.layers.append(RelGraphConvLayer(
                    self.h_dim, self.h_dim, self.rel_names,
                    self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
                    dropout=self.dropout, weight=True))
        # h2h
        if self.num_hidden_layers>1:
            for i in range(0,self.num_hidden_layers-1):
                self.layers.append(RelGraphConvLayer(
                    self.h_dim, self.h_dim, self.rel_names,
                    self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
                    dropout=self.dropout))
        # h2o
#         self.layers.append(RelGraphConvLayer(
#             self.h_dim, self.out_dim, self.rel_names, 
#             self.num_bases, activation=partial(F.softmax, dim=1),
#             self_loop=self.use_self_loop))
        self.classifier = nn.Linear(self.h_dim, self.out_dim)
    
    def forward(self, input_nodes, blocks=None):
        H={}
        for ntype, nid in input_nodes.items():
            nid = input_nodes[ntype]
            H[ntype] = self.node_embed[ntype](nid)
        if blocks is None:
            for layer in self.layers:
                H = layer(self.g, H)
        else:
            for layer, block in zip(self.layers, blocks):
                H = layer(block, H)
        output = self.classifier(H["usaanr"])
    
        return output, H["usaanr"]

In [9]:
def get_class_count_weight(y,n_classes):
    classes_count=[]
    weight=[]
    for i in range(n_classes):
        count=th.sum(y.squeeze()==i).item()
        classes_count.append(count)
        weight.append(len(y)/(n_classes*count))
    return classes_count,weight

In [10]:
def lift_gain_eval(logit,label,topk):
    DF=pd.DataFrame(columns=["pred_score","actual_label"])
    DF["pred_score"]=logit
    DF["actual_label"]=label
    DF.sort_values(by="pred_score", ascending=False, inplace=True)
    gain={}
    for p in topk:
        N=math.ceil(int(DF.shape[0]*p))
        DF2=DF.nlargest(N,"pred_score",keep="first")
        gain[str(int(p*100))+"%"]=round(DF2.actual_label.sum()/(DF.actual_label.sum()*p),2)
    return gain

In [11]:
def eval_loop_func(model, loader, labels, device, loss_weight, num_classes):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    losses=[]
    for input_nodes_raw, seeds, blocks in tqdm(loader, position=0, leave=True):
        blocks = [blk.to(device) for blk in blocks]
        seeds = seeds.to(device)
        input_nodes={}
        input_nodes["usaanr"]=input_nodes_raw
        input_nodes={k : e.to(device) for k, e in input_nodes.items()}
        lbl = labels[seeds].squeeze().to(device)
        with th.no_grad():
            logits,h = model(input_nodes,blocks)
            if loss_weight is None:
                loss = F.cross_entropy(logits.view(-1, num_classes), lbl.to(device))
            else:
                loss = F.cross_entropy(logits.view(-1, num_classes), lbl.to(device),weight=loss_weight.float())
            losses.append(loss.item())
        fin_targets.append(lbl.cpu().detach().numpy())
        fin_outputs.append(logits.cpu().detach().numpy())
    return np.concatenate(fin_outputs), np.concatenate(fin_targets), losses

def evaluate(target, predicted):
    true_label_mask=[1 if (np.argmax(x)-target[i])==0 else 0 for i,x in enumerate(predicted)]
    nb_prediction=len(true_label_mask)
    true_prediction=sum(true_label_mask)
    false_prediction=nb_prediction-true_prediction
    accuracy=true_prediction/nb_prediction
    precision, recall, fscore, support = precision_recall_fscore_support(target, predicted.argmax(axis=1))
    auc = roc_auc_score(target.ravel(), th.sigmoid(th.from_numpy(predicted))[:,1].numpy().ravel())
    prec,rec,_ = precision_recall_curve(target.ravel(), th.sigmoid(th.from_numpy(predicted))[:,1].numpy().ravel())
    pr_auc=auc_score(rec,prec)
    arg1=predicted[:,1]
    arg2=target
    gain = lift_gain_eval(arg1,arg2,topk=[0.01,0.05,0.10])
    return {
        "nb_example":len(target),
        "true_prediction":true_prediction,
        "false_prediction":false_prediction,
        "accuracy":accuracy,
        "precision":precision[1],
        "recall":recall[1],
        "f1_score":fscore[1],
        "AUC":auc,
        "pr_auc":pr_auc,
        "GAIN":gain
    }

In [12]:
# def evaluate(model, loader, labels, category, device):
#     model.eval()
#     total_loss = 0
#     total_acc = 0
#     total_precision=0
#     total_recall=0
#     total_fscore=0
#     total_auc=0
#     total_pr_auc=0
    
#     total_gain={}
#     for p in [0.01,0.05,0.10]:
#         total_gain[str(int(p*100))+"%"]=0
        
#     count = 0
#     count_loss=0
    
#     for input_nodes_raw, seeds_raw, blocks in tqdm(loader, position=0, leave=True):
#         blocks = [blk.to(device) for blk in blocks]
        
#         seeds=seeds_raw.to(device)

#         input_nodes={}
#         input_nodes[category]=input_nodes_raw
#         input_nodes={k : e.to(device) for k, e in input_nodes.items()}

#         lbl = labels[seeds].to(device)
#         logits,h = model(input_nodes,blocks)
#         loss = F.cross_entropy(logits, lbl.squeeze(1).to(device))
#         loss = F.cross_entropy(logits, lbl.squeeze(1),weight=th.Tensor([1,args.weight]).to(device))

#         acc = th.sum(logits.argmax(dim=1) == lbl.squeeze(1)).item() / logits.shape[0]
#         precision, recall, fscore, support = score(lbl.squeeze(1).cpu().numpy(), logits.argmax(dim=1).cpu().numpy())

      
#         auc = roc_auc_score(lbl.detach().cpu().numpy().ravel(), th.sigmoid(logits)[:,1].detach().cpu().numpy().ravel())
#         prec,rec,_ = precision_recall_curve(lbl.detach().cpu().numpy().ravel(), th.sigmoid(logits)[:,1].detach().cpu().numpy().ravel())
#         pr_auc=auc_score(rec,prec)
        
#         total_loss += loss.item() * len(seeds) 
#         total_acc += acc
#         total_precision += precision[1]
#         total_recall += recall[1]
#         total_fscore += fscore[1]
#         total_auc += auc
#         total_pr_auc += pr_auc
#         count += 1
#         count_loss += len(seeds)
        
#         arg1=logits[:,1].detach().cpu().numpy()
#         arg2=lbl.cpu().numpy()
#         gain = lift_gain_eval(arg1,arg2,topk=[0.01,0.05,0.10])
#         for k in gain.keys():
#             total_gain[k] += gain[k] 
    
#     GAIN={}
#     for k in total_gain.keys():
#         GAIN[k]=total_gain[k]/count
    
#     ACCURACY=total_acc / count
#     LOSS=total_loss / count_loss
#     AUC=total_auc/count
#     PR_AUC=total_pr_auc/count
#     PRECISION=total_precision/count
#     RECALL=total_recall/count
#     F1_SCORE=total_fscore/count
    
#     return LOSS, ACCURACY, PRECISION, RECALL, F1_SCORE, GAIN, AUC, PR_AUC


#### create subgraph for the purpose of preliminary test

In [13]:
# dict_nodes={"usaanr":th.arange(G.num_nodes('usaanr'))[0:1000],'zipcode':th.arange(G.num_nodes('zipcode'))[0:100]}
# sg=dgl.node_subgraph(G,dict_nodes)

# G=G.node_type_subgraph(['usaanr'])
# dict_edges={}
# for etype in G.etypes:
#     dict_edges[etype]=th.arange(G.num_edges(etype))[0:5000]
# G=dgl.edge_subgraph(G,dict_edges)

# G.nodes['usaanr'].data["_ID"].numpy().shape,binary_label.shape, binary_label[G.nodes['usaanr'].data["_ID"]].shape

#### settings

In [14]:
parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--dropout", type=float, default=0,
        help="dropout probability")
parser.add_argument("--h_dim", type=int, default=128,
        help="number of hidden units")
parser.add_argument("--out_dim", type=int, default=1,
        help="output dimension")
parser.add_argument("--gpu", type=int, default=0,
        help="gpu")
parser.add_argument("--lr", type=float, default=1e-5,
        help="learning rate")
parser.add_argument("--num_bases", type=int, default=-1,
        help="number of filter weight matrices, default: -1 [use all]")
parser.add_argument("--num_layers", type=int, default=1,
        help="number of propagation rounds")
parser.add_argument("-e", "--n_epochs", type=int, default=1,
        help="number of training epochs")
# parser.add_argument("--model_path", type=str, default="/workspace/cjiang/eagle_project/CAP_graph/CAP_without_zipcode/rgcn_model_param.pt",
#         help='path for save the model')
parser.add_argument("--l2norm", type=float, default=0,
        help="l2 norm coef")
parser.add_argument("--use_self_loop", default=True, action='store_true',
        help="include self feature as a special relation")
parser.add_argument("--batch-size", type=int, default=1024,
        help="Mini-batch size. If -1, use full graph training.")
parser.add_argument("--num_mini_batch", type=int, default=8,
        help="Number of minibatch.")
parser.add_argument("--fanout", type=int, default=None,
        help="Fan-out of neighbor sampling.")
parser.add_argument("--validation",  default=True,
        help="set up validation .")
parser.add_argument("--seed",  type=int,default=101,
        help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")

parser.add_argument("--loss_weight",  type=bool,default=True,  ## number of label=0/number of label=1
        help="weight for unbalance data")
parser.add_argument("--num_worker",  type=int,default=0,  
        help="number of worker for neighbor sampling") 

args,unknown=parser.parse_known_args()

args.num_layers=1
args.dropout=0.2
args.lr=1e-3
args.l2norm=1e-3
args.n_epochs=1
args.num_bases=5
args.h_dim=64
args.batch_size=10240
print(args)

Namespace(batch_size=10240, dropout=0.2, fanout=None, gpu=0, h_dim=64, l2norm=0.001, loss_weight=True, lr=0.001, n_epochs=1, num_bases=5, num_layers=1, num_mini_batch=8, num_worker=0, out_dim=1, seed=101, use_self_loop=True, validation=True)


In [15]:
num_rels=len(G.etypes)
# LABEL=th.tensor(binary_label[G.nodes['usaanr'].data["_ID"]]).long()
LABEL=th.tensor(binary_label).long()
labels, count=th.unique(LABEL,return_counts=True)
num_classes=labels.shape[0]
pd.DataFrame({"label_class":labels, "count":count}).style.format({'count':'{:,}'})

Unnamed: 0,label_class,count
0,0,19358913
1,1,6309591


In [16]:
# train_mask=train_mask_binary_label[G.nodes['usaanr'].data["_ID"]] 
# val_mask=val_mask_binary_label[G.nodes['usaanr'].data["_ID"]]
# test_mask=test_mask_binary_label[G.nodes['usaanr'].data["_ID"]]

train_mask=train_mask_binary_label  
val_mask=val_mask_binary_label
test_mask=test_mask_binary_label

train_idx=th.nonzero(train_mask.squeeze()).numpy()
val_idx=th.nonzero(val_mask.squeeze()).numpy()
test_idx=th.nonzero(test_mask.squeeze()).numpy()

train_idx=th.from_numpy(train_idx).squeeze(1)    
val_idx=th.from_numpy(val_idx).squeeze(1)    
test_idx=th.from_numpy(test_idx).squeeze(1)

train_label=LABEL[train_idx]
val_label=LABEL[val_idx]
test_label=LABEL[test_idx]

print('{:<15} {:<10,}'.format("Training set",train_idx.shape[0]))
print('{:<15} {:<10,}'.format("validation set",val_idx.shape[0]))
print('{:<15} {:<10,}'.format("test set",test_idx.shape[0]))

Training set    20,534,804
validation set  2,566,850 
test set        2,566,850 


In [17]:
assert train_idx.shape[0]+val_idx.shape[0]+test_idx.shape[0] == G.num_nodes('usaanr')

In [18]:
##### check cuda
device="cpu"
use_cuda=args.gpu>=0 and th.cuda.is_available()
if use_cuda:
    th.cuda.set_device(args.gpu)
    device='cuda:%d' % args.gpu
print(device)

cuda:0


In [19]:
if args.loss_weight:
    train_classes_num,train_classes_weight=get_class_count_weight(train_label.squeeze(),num_classes)
    loss_weight=th.tensor(train_classes_weight).to(device)
else:
    loss_weight=None

In [20]:
# create model
model = Entity_Classify(G,
                       args.h_dim,
                       num_classes,
                       num_bases=args.num_bases,
                       num_hidden_layers=args.num_layers,
                       dropout=args.dropout,
                       use_self_loop=args.use_self_loop)
if use_cuda:
    model.cuda()

model.layers

ModuleList(
  (0): RelGraphConvLayer(
    (conv): HeteroGraphConv(
      (mods): ModuleDict(
        (AUTO_RELATED): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Brother_Sister): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Busi_rel_Other): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Child): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Ex-Spouse): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Parent): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Pers_rel_Other): GraphConv(in=64, out=64, normalization=both, activation=None)
        (SPONSEE): GraphConv(in=64, out=64, normalization=both, activation=None)
        (SPONSOR): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Spouse): GraphConv(in=64, out=64, normalization=both, activation=None)
        (Step-Child): GraphConv(in=64, out=64, normalization=both, activa

In [21]:
optimizer = th.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2norm)

In [22]:
# train sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.num_layers)
train_loader = dgl.dataloading.NodeDataLoader(
    G, {'usaanr': train_idx}, sampler,
    batch_size=args.batch_size, shuffle=True, num_workers=args.num_worker)
# validation sampler
# we do not use full neighbor to save computation resources
val_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.num_layers)
val_loader = dgl.dataloading.NodeDataLoader(
    G, {'usaanr': val_idx}, val_sampler,
    batch_size=args.batch_size, shuffle=False, num_workers=args.num_worker)

test_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.num_layers)
test_loader = dgl.dataloading.NodeDataLoader(
    G, {'usaanr': test_idx}, test_sampler,
    batch_size=args.batch_size, shuffle=False, num_workers=args.num_worker)

print("The number of minibatch in training set is {:,}".format(len(train_loader)))
print("The number of minibatch in validation set is {:,}".format(len(val_loader)))
print("The number of minibatch in test set is {:,}".format(len(test_loader)))

The number of minibatch in training set is 2,006
The number of minibatch in validation set is 251
The number of minibatch in test set is 251


In [23]:
print("The total # of parameter is {:,}".format(sum([p.nelement() for p in model.parameters()]) ) )

The total # of parameter is 42,750


In [24]:
param_dict={n: p.nelement() for n, p in model.named_parameters()}
for i,j in param_dict.items():
    print("{:<70}{:<15,}".format(i,j))

node_embed.usaanr.emb.usaayr.weight                                   5,440          
node_embed.usaanr.emb.AGE_BAND.weight                                 448            
node_embed.usaanr.emb.ORIGEL.weight                                   2,752          
node_embed.usaanr.emb.ELIG2.weight                                    384            
node_embed.usaanr.emb.cmpyelig.weight                                 320            
node_embed.usaanr.emb.SEX.weight                                      320            
node_embed.usaanr.emb.MARST.weight                                    640            
node_embed.usaanr.emb.BRANCH.weight                                   1,088          
node_embed.usaanr.emb.MILST.weight                                    512            
node_embed.usaanr.emb.MLIST_OrigStat.weight                           192            
node_embed.usaanr.emb.ENLPAYGD.weight                                 1,600          
node_embed.usaanr.emb.ACTCORP.weight                  

In [25]:
LOSS_EPOCH=[]
LABEL_TRAIN=[]
# training loop
print("start training...")
dur = []
total_loss=0
losses=[]

LOGIT_train=[]
LABEL_train=[]

for epoch in tqdm(range(0,args.n_epochs)):
    
    model.train()
    IDX=[]
    H=[]
    LABEL_train=[]
    
    #====================================#
    #            Traning                 #
    #====================================#
    print("")
    print("========= Epoch {:} /{:}".format(epoch+1,args.n_epochs))
    print("Training...")
    t0 = time.time()
    for step, (input_nodes_raw, seeds_raw, blocks) in enumerate(train_loader):
        blocks = [blk.to(device) for blk in blocks]
        
        seeds=seeds_raw.to(device)
        
        labels_train=LABEL[seeds]       
        labels_train = labels_train.to(device)
        
        input_nodes={}
        input_nodes["usaanr"]=input_nodes_raw
        input_nodes={k : e.to(device) for k, e in input_nodes.items()}
        
        logits,h = model(input_nodes,blocks)
        optimizer.zero_grad()
        if args.loss_weight:
            loss = F.cross_entropy(logits.view(-1,num_classes), labels_train.squeeze().to(device),weight=loss_weight.float().to(device))
        else:
            loss = F.cross_entropy(logits.view(-1,num_classes), labels_train.squeeze().to(device))
        total_loss+=loss.item()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        
        arg1=logits[:,1].detach().cpu().numpy()
        arg2=labels_train.cpu().numpy()
                
        train_gain = lift_gain_eval(arg1,arg2,topk=[0.01,0.05,0.10])
        train_acc = th.sum(logits.argmax(dim=1) == labels_train).item() / len(seeds)
        precision, recall, fscore, support = precision_recall_fscore_support(labels_train.cpu().numpy(), logits.argmax(dim=1).cpu().numpy())
        try:
            train_auc = roc_auc_score(labels_train.detach().cpu().numpy().ravel(), th.sigmoid(logits)[:,1].detach().cpu().numpy().ravel())
        except ValueError:
            pass
        prec,rec,_ = precision_recall_curve(labels_train.detach().cpu().numpy().ravel(), th.sigmoid(logits)[:,1].detach().cpu().numpy().ravel())
        if math.isnan(rec[0])==False:
            train_pr_auc=auc_score(rec,prec)
        IDX.extend(seeds.detach().cpu().numpy().tolist())
        H.extend(h.detach().cpu().numpy().tolist())
        LOGIT_train.extend(logits.detach().cpu().numpy().tolist())
        LABEL_train.extend(binary_label[blocks[-1].dstdata[dgl.NID].cpu().numpy()].tolist())
        if step%(len(train_loader)//10)==0 and not step==0:
            t1 = time.time()
            elapsed=utils.format_time(t1-t0)
            print("Batch {:} of {:} | Loss {:.3f}  | Elapsed: {:}".\
                  format(step,len(train_loader),np.mean(losses[-10:]),elapsed))
    LOSS_EPOCH.append(loss)
    LABEL_TRAIN.append(binary_label[blocks[-1].ndata[dgl.NID]['usaanr'].cpu().numpy()])
    model.eval()
    print()
    print("")
    print("Running Validation on training set")
    print("")
    fin_outputs, fin_targets, losses_tmp=eval_loop_func(model, train_loader, binary_label,  device, loss_weight, num_classes)
    avg_loss_train=np.mean(losses_tmp)
    tmp_mean_pool_train=evaluate(fin_targets.reshape(-1),fin_outputs)
    t2=time.time()
    print("avg_loss: {:.2f} | True_Prediction: {:,} | False_Prediction: {:,} | accuracy: {:.2%} |  precision: {:.2%} | recall: {:.2%} | F1_score: {:.2%} | Gain_top-10%: {:.1f} |\
    ROC_AUC: {:.1%} | PR_AUC: {:.1%} | Elapsed: {:}".format(avg_loss_train, tmp_mean_pool_train["true_prediction"], tmp_mean_pool_train["false_prediction"], tmp_mean_pool_train["accuracy"], \
                                                            tmp_mean_pool_train["precision"], tmp_mean_pool_train["recall"], tmp_mean_pool_train["f1_score"], tmp_mean_pool_train["GAIN"]['10%'], \
                                                            tmp_mean_pool_train["AUC"], tmp_mean_pool_train["pr_auc"], utils.format_time(t2-t1)))
    #====================================#
    #            Validation-set          #
    #====================================#
    model.eval()
    print()
    print("")
    print("Running Validation on validation set")
    print("")
    fin_outputs, fin_targets, losses_tmp=eval_loop_func(model, val_loader, binary_label,  device, loss_weight, num_classes)
    avg_loss_val=np.mean(losses_tmp)
    tmp_mean_pool_val=evaluate(fin_targets.reshape(-1),fin_outputs)
    t3=time.time()
    print("avg_loss: {:.2f} | True_Prediction: {:,} | False_Prediction: {:,} | accuracy: {:.2%} |  precision: {:.2%} | recall: {:.2%} | F1_score: {:.2%} | Gain_top-10%: {:.1f} |\
    ROC_AUC: {:.1%} | PR_AUC: {:.1%} | Elapsed: {:}".format(avg_loss_val, tmp_mean_pool_val["true_prediction"], tmp_mean_pool_val["false_prediction"], tmp_mean_pool_val["accuracy"], \
                                                            tmp_mean_pool_val["precision"], tmp_mean_pool_val["recall"], tmp_mean_pool_val["f1_score"], tmp_mean_pool_val["GAIN"]['10%'], \
                                                            tmp_mean_pool_val["AUC"], tmp_mean_pool_val["pr_auc"], utils.format_time(t3-t2)))
# if args.model_path is not None:
#     th.save(model.state_dict(), args.model_path)
#====================================#
#            Test-set                #
#====================================#
print()
print("")
print("Running Validation in Test Dataset")
print("")
model.eval()
fin_outputs, fin_targets, losses_tmp=eval_loop_func(model, test_loader, binary_label,  device, loss_weight, num_classes)
avg_loss_test=np.mean(losses_tmp)
tmp_mean_pool_test=evaluate(fin_targets.reshape(-1),fin_outputs)
t4=time.time()
print("avg_loss: {:.2f} | True_Prediction: {:,} | False_Prediction: {:,} | accuracy: {:.2%} |  precision: {:.2%} | recall: {:.2%} | F1_score: {:.2%} | Gain_top-10%: {:.1f} |\
ROC_AUC: {:.1%} | PR_AUC: {:.1%} | Elapsed: {:}".format(avg_loss_test, tmp_mean_pool_test["true_prediction"], tmp_mean_pool_test["false_prediction"], tmp_mean_pool_test["accuracy"], \
                                                        tmp_mean_pool_test["precision"], tmp_mean_pool_test["recall"], tmp_mean_pool_test["f1_score"], tmp_mean_pool_test["GAIN"]['10%'], \
                                                        tmp_mean_pool_test["AUC"], tmp_mean_pool_test["pr_auc"], utils.format_time(t4-t3)))

  0%|          | 0/1 [00:00<?, ?it/s]

start training...

Training...
Batch 200 of 2006 | Loss 0.681  | Elapsed: 0:01:23
Batch 400 of 2006 | Loss 0.676  | Elapsed: 0:02:20
Batch 600 of 2006 | Loss 0.675  | Elapsed: 0:03:22
Batch 800 of 2006 | Loss 0.675  | Elapsed: 0:04:23
Batch 1000 of 2006 | Loss 0.673  | Elapsed: 0:05:19
Batch 1200 of 2006 | Loss 0.673  | Elapsed: 0:06:15
Batch 1400 of 2006 | Loss 0.672  | Elapsed: 0:07:30
Batch 1600 of 2006 | Loss 0.672  | Elapsed: 0:08:17
Batch 1800 of 2006 | Loss 0.670  | Elapsed: 0:09:28
Batch 2000 of 2006 | Loss 0.671  | Elapsed: 0:10:46


  0%|          | 0/2006 [00:00<?, ?it/s]



Running Validation on training set



100%|██████████| 2006/2006 [05:11<00:00,  6.44it/s]  
  0%|          | 1/251 [00:00<00:28,  8.68it/s]

avg_loss: 0.67 | True_Prediction: 13,388,379 | False_Prediction: 7,146,425 | accuracy: 65.20% |  precision: 34.72% | recall: 47.25% | F1_score: 40.03% | Gain_top-10%: 1.4 |    ROC_AUC: 61.5% | PR_AUC: 32.0% | Elapsed: 0:07:00


Running Validation on validation set



 88%|████████▊ | 221/251 [00:23<00:03,  9.51it/s]

KeyboardInterrupt: 