In [1]:
from collections  import OrderedDict
import copy
import argparse
import itertools
import os
import numpy as np
from numpy import save,load,savetxt,loadtxt,savez_compressed
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc as auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder, label_binarize
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, sum_models

import pandas as pd
import scipy.sparse as sp
import time
from tqdm import tqdm, tqdm_notebook,tnrange
tqdm.pandas(position=0, leave=True)
import math 
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl import edge_subgraph
from dgl.nn.functional import edge_softmax
import dgl.nn as dglnn
import dgl.function as fn

from GraphSage_Model import *
from evaluation import *
from inductive_graph import *
from graph_to_dataframe import *
from MLP_Model import *
from print_func import *

from MLP_run import *
from catboost_run import *
from GraphSage_run import *
from GraphSage_featureless_run import *

import functools
import seaborn as sns
import pickle
import random

import warnings
warnings.filterwarnings('ignore')
import utils

# Report the library versions this notebook was executed with.
print(f"torch version is {th.__version__}")
print(f"DGL version is {dgl.__version__}")


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and all CUDA devices) and
    DGL with the same value, switches cuDNN to deterministic mode with
    auto-tuning disabled, and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed shared by all generators.
    """
    # Exported for completeness; the running interpreter has already fixed
    # its hash seed, so this only matters for processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # One seed, applied to each library's global generator.
    seeders = (
        random.seed,
        np.random.seed,
        th.manual_seed,
        th.cuda.manual_seed_all,
        dgl.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    cudnn = th.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

Using backend: pytorch


torch version is 1.7.0
DGL version is 0.8a210831


In [2]:
if __name__=="__main__":
    
    parser = argparse.ArgumentParser(description='RGCN')
    parser.add_argument("--dropout", type=float, default=0.2,
            help="dropout probability")
    parser.add_argument("--h_dim", type=int, default=64,
            help="number of hidden units")
#     parser.add_argument("--out_dim", type=int, default=1,
#             help="output dimension")
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-3,
            help="learning rate")
    parser.add_argument("--featureless_lr", type=float, default=1e-4,
            help='Learning Rate for featureless graph model')
    parser.add_argument("--num_bases", type=int, default=5,
            help="number of filter weight matrices, default: -1 [use all]")
    parser.add_argument("--num_layers", type=int, default=1,
            help="number of propagation rounds")
    parser.add_argument("-e", "--n_epochs", type=int, default=5,
            help="number of training epochs")
#     parser.add_argument("--model_path", type=str, default="/workspace/cjiang/eagle_project/CAP_graph/CAP_without_zipcode/rgcn_model_param.pt",
#             help='path for save the model')
    parser.add_argument("--l2norm", type=float, default=1e-3,
            help="l2 norm coef")
    parser.add_argument("--use_self_loop", default=True, action='store_true',
            help="include self feature as a special relation")
    parser.add_argument("--batch-size", type=int, default=1024,
            help="Mini-batch size. If -1, use full graph training.")
    parser.add_argument("--fanout", type=int, default=15,
            help="Fan-out of neighbor sampling.")
    parser.add_argument("--seed",  type=int,default=101,
            help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")
    parser.add_argument("--loss_weight",  type=bool,default=True,  ## number of label=0/number of label=1
            help="weight for unbalance data")
    parser.add_argument("--num_worker",  type=int,default=0,  
            help="number of worker for neighbor sampling") 
    parser.add_argument("--train_test_split", type=float, default=0.1,
            help="the proportion of test dataset")
    
    parser.add_argument("--loss_function", type=str, default="MultiClass",
            help='Loss function for Catboost')    
    parser.add_argument("--CatBoost_LR", type=float, default=0.01,
            help='Learning Rate for Catboost')  
    parser.add_argument("--iterations", type=int, default=3000,
            help='epochs iterations for Catboost')  
    parser.add_argument("--early_stopping", type=int, default=200,
            help='early_stopping rounds for Catboost') 
    parser.add_argument("--device_type", type=str, default="GPU",
            help='GPU utilization for Catboost training')      
    parser.add_argument("--verbose", type=int, default=200,
            help='verbose details for Catboost training')  
    
    args,_=parser.parse_known_args()
    
    args.batch_size=1024
    args.train_test_split=0.10
    args.n_epochs=1
    print(args)
    print()
    
    seed_everything(args.seed)
    
    
    KG_dir="/workspace/cjiang/eagle_project/CAP_graph/BGNN/"

    start=time.time()
    with open(os.path.join(KG_dir,'CAP_Graph_v1'), 'rb') as f:
        G,multi_label,binary_label,\
        train_mask_multi_label,  val_mask_multi_label,  test_mask_multi_label,\
        train_mask_binary_label, val_mask_binary_label, test_mask_binary_label= pickle.load(f)
    end=time.time()
    print("It took {:0.4f} seconds to load graph".format(end-start))

    usaanr_feat=[]
    for key, scheme in G.node_attr_schemes(ntype="usaanr").items():
        usaanr_feat.append(key)

    usaanr_feat=[x for x in usaanr_feat if x not in 
                 ['usaanr','cmpyelig','ACTCORP','Segment','train_mask','val_mask','test_mask','label','_ID']]

    print()
    print("The features associated with USAA Member are\n ")
    for i in usaanr_feat:
        print(i)
    print()
    
    G.nodes['usaanr'].data['label']=binary_label
    
    dict_edges={}
    for etype in G.etypes:
        dict_edges[etype]=th.arange(G.num_edges(etype))[0:2000]
    sg=dgl.edge_subgraph(G,dict_edges)
    G=copy.deepcopy(sg)
    
    subgraph_class=create_inductive_graph(G,args.train_test_split,args.seed)
    train_g, test_g=subgraph_class.subgraph_func()
    test_idx=subgraph_class.nodes_idx(train_g,test_g)
    
    assert train_g.num_nodes()+test_idx.shape[0]==test_g.num_nodes()
    
    device="cpu"
#     use_cuda=args.gpu>=0 and th.cuda.is_available()
#     if use_cuda:
#         th.cuda.set_device(args.gpu)
#         device='cuda:%d' % args.gpu
    
    data=G, train_g, test_g, test_idx
    
    %pdb
#     train_graph_v1, test_graph_v1=graph_run_featureless(args,usaanr_feat,device,data)    
#     train_graph_v2, test_graph_v2=graph_run(args,usaanr_feat,device,data)
#     train_catboost, test_catboost=catboost_run(args,device,data)
#     train_mlp, test_mlp=MLP_run(args,usaanr_feat,device,data)
    
#     print()
#     func_print(train_catboost, train_mlp, train_graph_v1, train_graph_v2, "train_output.txt")
#     print()
#     func_print(test_catboost, test_mlp, test_graph_v1, test_graph_v2, "test_output.txt")

Namespace(CatBoost_LR=0.01, batch_size=1024, device_type='GPU', dropout=0.2, early_stopping=200, fanout=15, featureless_lr=0.0001, gpu=0, h_dim=64, iterations=3000, l2norm=0.001, loss_function='MultiClass', loss_weight=True, lr=0.001, n_epochs=1, num_bases=5, num_layers=1, num_worker=0, seed=101, train_test_split=0.1, use_self_loop=True, verbose=200)

It took 11.2639 seconds to load graph

The features associated with USAA Member are
 
usaayr
AGE_BAND
ORIGEL
ELIG2
SEX
MARST
BRANCH
ENLPAYGD
MILST
MLIST_OrigStat
STATE

Automatic pdb calling has been turned ON


In [3]:
# Train the featureless graph model for args.n_epochs epochs on train_g, then
# re-score the training set with eval_loop_func and print summary metrics.
G, train_g, test_g, test_idx=data

# Seeds for the loaders: every node of train_g, and the held-out nodes of test_g.
train_idx=th.arange(train_g.num_nodes()).squeeze()
test_idx=th.from_numpy(test_idx).squeeze()

train_label=train_g.nodes['usaanr'].data['label']
test_label=test_g.nodes['usaanr'].data['label']

label_train=train_label.squeeze().numpy()
label_test=test_label.squeeze().numpy()

print('{:<15} {:<10,}'.format("Training set",train_idx.shape[0]))
print('{:<15} {:<10,}'.format("test set",test_idx.shape[0]))
print()

num_classes=th.unique(th.from_numpy(label_train)).shape[0]
if args.loss_weight:
    train_classes_num, train_classes_weight = get_class_count_weight(label_train,num_classes)
    loss_weight=th.tensor(train_classes_weight).to(device)
else:
    loss_weight=None

# Class weights in the dtype/device cross_entropy needs; built once instead of
# on every step. None means an unweighted loss.
class_weight=loss_weight.float().to(device) if loss_weight is not None else None

model = Entity_Classify_FeatureLess(G,
                                    device,
                                    args.h_dim,
                                    num_classes,
                                    num_bases=args.num_bases,
                                    num_hidden_layers=args.num_layers,
                                    dropout=args.dropout,
                                    use_self_loop=args.use_self_loop)
if device !="cpu":
    # .to(device) honours the device index, unlike a bare .cuda().
    model.to(device)

optimizer = th.optim.Adam(model.parameters(), lr=args.featureless_lr, weight_decay=args.l2norm)

# train sampler
train_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.num_layers)
train_loader = dgl.dataloading.NodeDataLoader(
    train_g, {'usaanr': train_idx}, train_sampler,
    batch_size=args.batch_size, shuffle=True, num_workers=args.num_worker)

# test sampler
test_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.num_layers)
test_loader = dgl.dataloading.NodeDataLoader(
    test_g, {'usaanr': test_idx}, test_sampler,
    batch_size=args.batch_size, shuffle=False, num_workers=args.num_worker)

print("The number of minibatch in training set is {:,}".format(len(train_loader)))
print("The number of minibatch in test set is {:,}".format(len(test_loader)))
print()

# Progress is logged roughly ten times per epoch. The max() guard keeps the
# modulo below from dividing by zero when there are fewer than 10 minibatches.
log_every=max(1,len(train_loader)//10)


#### Training Loop
print()
print("***************************************************************** ")
print("========= Training Loop For Graph Model without feature ========= ")
print("***************************************************************** ")
print()

# These accumulators are deliberately created once: they grow across epochs.
LOSS_EPOCH=[]
LABEL_TRAIN=[]
total_loss=0
losses=[]
LOGIT_train=[]
LABEL_train=[]

for epoch in tqdm(range(0,args.n_epochs)):

    model.train()
    IDX=[]
    H=[]

    #====================================#
    #            Traning                 #
    #====================================#
    print("")
    print("========= Epoch {:} /{:}".format(epoch+1,args.n_epochs))
    print("Training...")
    t0 = time.time()
    for step, (input_nodes_raw, seeds_raw, blocks) in enumerate(train_loader):
        blocks = [blk.to(device) for blk in blocks]

        seeds=seeds_raw.to(device)

        # Index the CPU label tensor with the CPU seeds, then move the result;
        # indexing it with device-resident seeds breaks once device is a GPU.
        labels_train=train_label[seeds_raw].to(device)

        input_nodes={}
        input_nodes["usaanr"]=input_nodes_raw
        input_nodes={k : e.to(device) for k, e in input_nodes.items()}

        logits,h = model(train_g,input_nodes,blocks)
        optimizer.zero_grad()

        loss = F.cross_entropy(logits.view(-1, num_classes),
                               labels_train.squeeze(),weight=class_weight)

        total_loss+=loss.item()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        # Per-batch diagnostics (kept as in the original session).
        arg1=logits[:,1].detach().cpu().numpy()
        arg2=labels_train.cpu().numpy()

        train_gain = lift_gain_eval(arg1,arg2,topk=[0.01,0.05,0.10])

        train_acc = th.sum(logits.argmax(dim=1) == labels_train).item() / len(seeds)
        precision, recall, fscore, support = precision_recall_fscore_support(labels_train.cpu().numpy(), 
                                                                             logits.argmax(dim=1).cpu().numpy())

        try:
            train_auc = roc_auc_score(labels_train.detach().cpu().numpy().ravel(), th.sigmoid(logits)\
                                      [:,1].detach().cpu().numpy().ravel())
        except ValueError:
            # roc_auc_score refuses batches that contain a single class; record
            # that explicitly instead of keeping a stale or undefined value.
            train_auc = float("nan")

        prec,rec,_ = precision_recall_curve(labels_train.detach().cpu().numpy().ravel(), th.sigmoid(logits)\
                                            [:,1].detach().cpu().numpy().ravel())
        if not math.isnan(rec[0]):
            train_pr_auc=auc_score(rec,prec)

        IDX.extend(seeds.detach().cpu().numpy().tolist())
        H.extend(h["usaanr"].detach().cpu().numpy().tolist())
        LOGIT_train.extend(logits.detach().cpu().numpy().tolist())
        LABEL_train.extend(train_label[blocks[-1].dstnodes['usaanr'].data[dgl.NID].cpu().numpy()].tolist())

        if step%log_every==0 and not step==0:

            t1 = time.time()
            elapsed=utils.format_time(t1-t0)
            print("Batch {:} of {:} | Loss {:.3f}  | Elapsed: {:}".\
                  format(step,len(train_loader),np.mean(losses[-10:]),elapsed)) 

    # Loss of the last minibatch of this epoch.
    LOSS_EPOCH.append(loss)

    LABEL_TRAIN.append(train_label[blocks[-1].nodes['usaanr'].data[dgl.NID].cpu().numpy()])


    model.eval()
    print()
    print("")
    print("Running Validation on training set")
    print("")
    # Timestamp taken here so "Elapsed" below is always defined (t1 is only
    # bound when a progress line was printed) and reports the validation pass.
    t_val=time.time()
    fin_outputs, fin_targets, losses_tmp=eval_loop_func(train_g, model, train_loader, train_label,  device, loss_weight, num_classes)

    avg_loss_train=np.mean(losses_tmp)

    tmp_mean_pool_train=evaluate(fin_targets.reshape(-1),fin_outputs)

    t2=time.time()

    print("avg_loss: {:.2f} | True_Prediction: {:,} | False_Prediction: {:,} | accuracy: {:.2%} |  precision: {:.2%} | recall: \
        {:.2%} | F1_score: {:.2%} | Gain_top-10%: {:.1f} | ROC_AUC: {:.1%} | PR_AUC: {:.1%} | Elapsed: {:}".format(avg_loss_train, 
          tmp_mean_pool_train["true_prediction"], tmp_mean_pool_train["false_prediction"], tmp_mean_pool_train["accuracy"], \
          tmp_mean_pool_train["precision"], tmp_mean_pool_train["recall"],tmp_mean_pool_train["f1_score"], \
          tmp_mean_pool_train["GAIN"]['10%'], tmp_mean_pool_train["AUC"],tmp_mean_pool_train["pr_auc"],utils.format_time(t2-t_val)))


  0%|          | 0/1 [00:00<?, ?it/s]

Training set    22,302    
test set        4,811     

The number of minibatch in training set is 22
The number of minibatch in test set is 5


***************************************************************** 
***************************************************************** 


Training...
Batch 2 of 22 | Loss 1.356  | Elapsed: 0:00:01
Batch 4 of 22 | Loss 1.353  | Elapsed: 0:00:01
Batch 6 of 22 | Loss 1.361  | Elapsed: 0:00:01
Batch 8 of 22 | Loss 1.356  | Elapsed: 0:00:02
Batch 10 of 22 | Loss 1.345  | Elapsed: 0:00:02
Batch 12 of 22 | Loss 1.340  | Elapsed: 0:00:03
Batch 14 of 22 | Loss 1.320  | Elapsed: 0:00:03
Batch 16 of 22 | Loss 1.289  | Elapsed: 0:00:03
Batch 18 of 22 | Loss 1.282  | Elapsed: 0:00:03


  0%|          | 0/22 [00:00<?, ?it/s]

Batch 20 of 22 | Loss 1.277  | Elapsed: 0:00:04


Running Validation on training set



100%|██████████| 22/22 [00:01<00:00, 13.72it/s]
100%|██████████| 1/1 [00:05<00:00,  5.62s/it]

avg_loss: 1.11 | True_Prediction: 11,354 | False_Prediction: 10,948 | accuracy: 50.91% |  precision: 43.26% | recall:         44.53% | F1_score: 43.89% | Gain_top-10%: 0.1 | ROC_AUC: 49.7% | PR_AUC: 43.0% | Elapsed: 0:00:02





In [4]:
# Score the held-out test nodes with the model trained in the previous cell.
# NOTE: in the recorded run below this call raised IndexError inside the
# embedding lookup of GraphSage_Model.forward.
(fin_outputs,
 fin_targets,
 losses_tmp) = eval_loop_func(
    test_g, model, test_loader, test_label,
    device, loss_weight, num_classes,
)

  0%|          | 0/5 [00:00<?, ?it/s]


IndexError: index out of range in self

> [0;32m/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py[0m(1852)[0;36membedding[0;34m()[0m
[0;32m   1850 [0;31m        [0;31m# remove once script supports set_grad_enabled[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1851 [0;31m        [0m_no_grad_embedding_renorm_[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mmax_norm[0m[0;34m,[0m [0mnorm_type[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1852 [0;31m    [0;32mreturn[0m [0mtorch[0m[0;34m.[0m[0membedding[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mpadding_idx[0m[0;34m,[0m [0mscale_grad_by_freq[0m[0;34m,[0m [0msparse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1853 [0;31m[0;34m[0m[0m
[0m[0;32m   1854 [0;31m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/lib/python3.6/site-packages/torch/nn/modules/sparse.py[0m(126)[0;36mforward[0;34m()[0m
[0;32m    124 [0;31m        return F.embedding(
[0m[0;32m    125 [0;31m            [0minput[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mweight[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mpadding_idx[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mmax_norm[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 126 [0;31m            self.norm_type, self.scale_grad_by_freq, self.sparse)
[0m[0;32m    127 [0;31m[0;34m[0m[0m
[0m[0;32m    128 [0;31m    [0;32mdef[0m [0mextra_repr[0m[0;34m([0m[0mself[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/workspace/cjiang/eagle_project/CAP_graph/inductive_learning/GraphSage_Model.py[0m(37)[0;36mforward[0;34m()[0m
[0;32m     35 [0;31m[0;31m#         self.emb=self.emb.to("cpu")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m        [0midx[0m[0;34m=[0m[0msg[0m[0;34m.[0m[0mnodes[0m[0;34m[[0m[0;34m'usaanr'[0m[0;34m][0m[0;34m.[0m[0mdata[0m[0;34m[[0m[0mdgl[0m[0;34m.[0m[0mNID[0m[0;34m][0m[0;34m[[0m[0mnid[0m[0;34m][0m[0;34m.[0m[0msqueeze[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m        [0mout_feature[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0memb[0m[0;34m[[0m[0;34m'usaanr'[0m[0;34m][0m[0;34m([0m[0midx[0m[0;34m)[0m[0;34m.[0m[0msqueeze[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   

ipdb>  idx.shape


torch.Size([1203])


ipdb>  idx.max()


tensor(22033712)


ipdb>  self.emb['usaanr'].weight.shape


torch.Size([27114, 64])


ipdb>  nid.shape


torch.Size([1203])


ipdb>  nid.max()


tensor(22205)


ipdb>  q


In [None]:
# model.eval()
# fin_targets=[]
# fin_outputs=[]
# losses=[]
# input_nodes_raw, seeds_raw, blocks =next(iter(test_loader))

# blocks = [blk.to(device) for blk in blocks]
# seeds = seeds_raw.to(device)

# input_nodes={}
# input_nodes["usaanr"]=input_nodes_raw
# input_nodes={k : e.to(device) for k, e in input_nodes.items()}

# lbl = labels[seeds].squeeze().to(device)

In [6]:
# Display the summary of G: node count, edge count per relation, and metagraph.
G

Graph(num_nodes={'usaanr': 27113},
      num_edges={('usaanr', 'AUTO_RELATED', 'usaanr'): 2000, ('usaanr', 'Brother_Sister', 'usaanr'): 2000, ('usaanr', 'Busi_rel_Other', 'usaanr'): 2000, ('usaanr', 'Child', 'usaanr'): 2000, ('usaanr', 'Ex-Spouse', 'usaanr'): 2000, ('usaanr', 'Parent', 'usaanr'): 2000, ('usaanr', 'Pers_rel_Other', 'usaanr'): 2000, ('usaanr', 'SPONSEE', 'usaanr'): 2000, ('usaanr', 'SPONSOR', 'usaanr'): 2000, ('usaanr', 'Spouse', 'usaanr'): 2000, ('usaanr', 'Step-Child', 'usaanr'): 2000, ('usaanr', 'Step-Parent', 'usaanr'): 2000},
      metagraph=[('usaanr', 'usaanr', 'AUTO_RELATED'), ('usaanr', 'usaanr', 'Brother_Sister'), ('usaanr', 'usaanr', 'Busi_rel_Other'), ('usaanr', 'usaanr', 'Child'), ('usaanr', 'usaanr', 'Ex-Spouse'), ('usaanr', 'usaanr', 'Parent'), ('usaanr', 'usaanr', 'Pers_rel_Other'), ('usaanr', 'usaanr', 'SPONSEE'), ('usaanr', 'usaanr', 'SPONSOR'), ('usaanr', 'usaanr', 'Spouse'), ('usaanr', 'usaanr', 'Step-Child'), ('usaanr', 'usaanr', 'Step-Parent')])

In [7]:
# Build a throw-away embedding with one row per 'usaanr' node plus one extra
# row, and show the resulting weight shape.
embed = nn.Embedding(
    num_embeddings=G.num_nodes("usaanr") + 1,
    embedding_dim=5,
)
embed.weight.shape

torch.Size([27114, 5])

In [None]:
# Look up the stored dgl.NID values for a couple of local test_g node ids.
# Two fixes over the scratch version: `self.device` does not exist at notebook
# scope (it was pasted from the model's forward), so the notebook-level
# `device` is used; and probe ids beyond the graph size are dropped so the
# lookup cannot raise an out-of-range IndexError.
probe_nids=th.tensor([4857,800842])
probe_nids=probe_nids[probe_nids<test_g.num_nodes('usaanr')]
idx=test_g.nodes['usaanr'].data[dgl.NID][probe_nids].squeeze().to(device)
idx