In [1]:
import argparse
import itertools
import os
import numpy as np
from numpy import save,load,savetxt,loadtxt,savez_compressed
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc as auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder, label_binarize

import pandas as pd
import scipy.sparse as sp
import time
from tqdm import tqdm, tqdm_notebook,tnrange
tqdm.pandas(position=0, leave=True)
import math 
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl import edge_subgraph
import dgl.nn as dglnn
import dgl.function as fn

import functools
import seaborn as sns
import pickle
import random
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Session-wide plotting defaults: figure size, no image interpolation smoothing,
# and a grayscale colormap for images.
plt.rcParams['figure.figsize']=(5.0,4.0)
plt.rcParams['image.interpolation']='nearest'
plt.rcParams['image.cmap']='gray'
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from torch/DGL.
warnings.filterwarnings('ignore')
# Presumably project-local helper modules (their contents are not visible here).
import utils
import tsne_func
# Record the library versions this notebook was executed with.
print("torch version is {}".format(th.__version__))
print("DGL version is {}".format(dgl.__version__))

Using backend: pytorch


torch version is 1.6.0
DGL version is 0.6.0


In [2]:
KG_dir="/home/ubuntu/"

if os.path.isfile("/home/ubuntu/CAP_Graph_New")==False:
    !hdfs dfs -get /dz/dz_6104/disc.db/CAP_Graph_New  ~/CAP_Graph_New

start=time.time()
with open(os.path.join(KG_dir,'CAP_Graph_New'), 'rb') as f:
    hg,multi_label,binary_label,\
    train_mask_multi_label,  val_mask_multi_label,  test_mask_multi_label,\
    train_mask_binary_label, val_mask_binary_label, test_mask_binary_label= pickle.load(f)
end=time.time()
print("It took {:0.4f} seconds to load graph".format(end-start))

It took 6.9107 seconds to load graph


In [3]:
# Attach the binary label and the train/val/test masks to the 'usaanr' nodes.
hg.nodes['usaanr'].data["LABEL"] = binary_label
# Each mask is converted to a numpy array, given an extra axis at position 1,
# and stored as a torch tensor under its own key.
for mask_key, mask_values in (
    ("train_mask", train_mask_binary_label),
    ("val_mask", val_mask_binary_label),
    ("test_mask", test_mask_binary_label),
):
    mask_column = np.expand_dims(np.array(mask_values), 1)
    hg.nodes['usaanr'].data[mask_key] = th.tensor(mask_column)

In [8]:
# Tag every edge with the numeric id of its relation. The id is the position of
# the relation in this list ('Parent' -> 0 ... 'Busi_rel_Other' -> 11) and is
# stored as a float tensor with one entry per edge of that relation.
relation_names = [
    'Parent',
    'Child',
    'Spouse',
    'Ex-Spouse',
    'Brother_Sister',
    'Step-Parent',
    'Step-Child',
    'Pers_rel_Other',
    'SPONSOR',
    'SPONSEE',
    'AUTO_RELATED',
    'Busi_rel_Other',
]
for relation_id, relation_name in enumerate(relation_names):
    edge_count = hg.num_edges(relation_name)
    hg.edges[relation_name].data["etype"] = th.ones(edge_count) * relation_id

In [9]:
# Per-relation edge normalisation: each edge receives
# 1 / (number of edges of the same relation that share its destination node),
# stored as a column tensor under the 'norm' edge feature.
for relation in hg.canonical_etypes:
    _src, dst, _edge_ids = hg.all_edges(form='all', etype=relation)
    # counts[i] is how often the i-th unique destination occurs; indexing with
    # the inverse map broadcasts that count back onto every edge.
    _unique_dst, dst_to_unique, counts = th.unique(dst, return_inverse=True, return_counts=True)
    per_edge_in_degree = counts[dst_to_unique].float()
    hg.edges[relation].data['norm'] = (1.0 / per_edge_in_degree).unsqueeze(1)

In [10]:
# Names of every node feature stored on the 'usaanr' node type. No filtering is
# done here: all of them are carried over to the homogeneous graph.
usaanr_feat=list(hg.node_attr_schemes(ntype="usaanr").keys())

# Flatten the heterogeneous graph into a single homogeneous graph, copying the
# node features listed above and the 'norm' / 'etype' edge features.
# return_count=True additionally returns the per-type node and edge counts.
g, ntype_count, etype_count=dgl.to_homogeneous(hg,ndata=usaanr_feat,edata=['norm','etype'],store_type=True,return_count=True)

num_nodes=g.num_nodes()
node_ids=th.arange(num_nodes)
edge_norm=g.edata['norm']
# 'etype' is stored as a float tensor; keep an integer copy of it.
edge_type=g.edata['etype'].long()

# Re-key the node-type ids from DGL's reserved name to a plain 'ntype' feature.
g.ndata['ntype']=g.ndata.pop(dgl.NTYPE)

# Number of relations = largest relation id + 1. Computed from the integer copy
# so the result is a Python int; taking .item() of the float 'etype' tensor
# produced a float (e.g. 12.0), which is not a valid count/size argument.
num_rels=int(edge_type.max().item())+1

In [11]:
### Detach the "_ID" and "_TYPE" fields from the edges, keeping them in variables
_ID, _TYPE = g.edata.pop("_ID"), g.edata.pop("_TYPE")

# List the edge features that remain on the graph.
for key in g.edge_attr_schemes().keys():
    print(key)

norm
etype


In [12]:
### Detach everything that is not a model input from the nodes, so that the graph
### only contains the features used in the model. The detached tensors stay
### available as notebook variables (labels and masks are saved separately).
zipcd, AGE, train_mask, val_mask, test_mask, _ID, ntype, LABEL = (
    g.ndata.pop(field)
    for field in (
        "ZIPCD",
        "AGE",
        "train_mask",
        "val_mask",
        "test_mask",
        "_ID",
        "ntype",
        "LABEL",
    )
)

# List the node features that remain on the graph.
for key in g.node_attr_schemes().keys():
    print(key)

usaayr
AGE_BAND
ORIGEL
ELIG2
cmpyelig
SEX
MARST
PERSST
DEATHSDT
BRANCH
MILST
MLIST_OrigStat
enl1stsdt
COMMSDT
ENLPAYGD
ACTCORP
STATE
Segment


#### Save the homogeneous graph

In [13]:
# Local directory the homogeneous graph pickle is written to and read from.
data_dir = "/home/ubuntu/"

In [14]:
# Persist the homogeneous graph together with its labels and the three split
# masks as one pickled 5-tuple, and report how long the dump took.
start=time.time()
with open(os.path.join(data_dir,"homo_graph"),"wb") as f:
    # Pin protocol 4 instead of relying on the interpreter default: it is the
    # first protocol that supports objects larger than 4 GiB, it is the default
    # from Python 3.8 on, and any Python >= 3.4 can read it back.
    pickle.dump((g,LABEL,train_mask,val_mask, test_mask),f,protocol=4)
end=time.time()
print("It took {:0.4f} seconds to save graph database".format(end-start))

It took 32.2126 seconds to save graph database


In [15]:
!hdfs dfs -put -f ~/homo_graph  /dz/dz_6104/disc.db/gnnexplainer/homo_graph

In [16]:
# Read the saved 5-tuple (graph, labels, train/val/test masks) back from disk
# and report how long the load took.
start=time.time()
homo_graph_file=os.path.join(data_dir,"homo_graph")
with open(homo_graph_file,"rb") as f:
    restored=pickle.load(f)
    g,LABEL,train_mask,val_mask, test_mask=restored
end=time.time()
print("It took {:0.4f} seconds to load graph database".format(end-start))

It took 7.0890 seconds to load graph database
