In [None]:
# !pip install --quiet --pre --upgrade dgl-cu101
# !pip install --quiet torch==1.6.0

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.nn.pytorch import GATConv
# from bipartite_gatconv import BipartiteGATConv
from collections import defaultdict
from tqdm import tqdm
tqdm().pandas()
import pickle
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize']=(5.0,4.0)
plt.rcParams['image.interpolation']='nearest'
plt.rcParams['image.cmap']='gray'
import warnings
warnings.filterwarnings('ignore')

data_dir="/workspace/cjiang/eagle_project/CAP_graph/dataset/"
root_dir="/workspace/cjiang/eagle_project/CAP_graph/"

os.chdir(root_dir)

print("{:<20}{:<20}".format("torch version",torch.__version__))
print("{:<20}{:<20}".format("DGL version",dgl.__version__))


#### Load Dataset

In [None]:
def read_csv(data_dir, file):
    """Load ``file`` from ``data_dir`` as a DataFrame with exact duplicate rows removed.

    Prints the elapsed wall-clock time and the shape of the de-duplicated frame.
    The surviving rows keep their original index labels (the index is not reset).
    """
    t0=time.time()
    csv_path=os.path.join(data_dir,file)
    frame=pd.read_csv(csv_path).drop_duplicates()
    elapsed=time.time()-t0
    print("Dataloading running time is {:0.4f}".format(elapsed))
    print("The Shape of Dataset is {}".format(frame.shape))
    return frame

def to_pickle(data_dir,file_in,file_out):
    """Pickle the object ``file_in`` to ``data_dir/file_out`` and print the elapsed time.

    ``file_in`` must expose a ``to_pickle(path)`` method (e.g. a pandas DataFrame).
    """
    target_path=os.path.join(data_dir,file_out)
    t0=time.time()
    file_in.to_pickle(target_path)
    print("pickle time is {:0.4f}".format(time.time()-t0))
    
def read_pickle(data_dir,file):
    """Load the pickled object stored at ``data_dir/file`` and return it.

    Prints the elapsed load time and the ``shape`` of the loaded object.
    NOTE(review): unpickling executes arbitrary code; only use on trusted files.
    """
    t0=time.time()
    loaded=pd.read_pickle(os.path.join(data_dir,file))
    elapsed=time.time()-t0
    print("loading time is {:0.4f}".format(elapsed))
    print("The Shape of Dataset is {}".format(loaded.shape))
    return loaded

In [None]:
# Pers_Edges=read_csv(data_dir, 'Pers_Edges.csv')
# to_pickle(data_dir,Pers_Edges,'Pers_Edges_pickle')
Pers_Edges=read_pickle(data_dir,'Pers_Edges_pickle')

In [None]:
# Busi_Edges=read_csv(data_dir, 'Busi_Edges.csv')
# to_pickle(data_dir,Busi_Edges,'Busi_Edges_pickle')
Busi_Edges=read_pickle(data_dir,'Busi_Edges_pickle')

In [None]:
# zipcode_Edges=read_csv(data_dir, 'zipcode_Edges.csv')
# to_pickle(data_dir,zipcode_Edges,'zipcode_Edges_pickle')
zipcode_Edges=read_pickle(data_dir,'zipcode_Edges_pickle')

In [None]:
# Product_Edges=read_csv(data_dir, 'Product_Edges.csv')
# to_pickle(data_dir,Product_Edges,'Product_Edges_pickle')
Product_Edges=read_pickle(data_dir,'Product_Edges_pickle')

In [None]:
# vertices=read_csv(data_dir, 'vertices_clean.csv')
# to_pickle(data_dir,vertices,'vertices_pickle')
vertices=read_pickle(data_dir,'vertices_pickle')

#### Transform product edge data and create product bit vector

In [None]:
# Product_Edges=Product_Edges[Product_Edges["dst"].isin(["P_AUTO","P_HOME","P_RENT"])]

In [None]:
# def map_prod(LIST):
#     if (LIST.count("P_AUTO")==1) and (LIST.count("P_HOME")==1) and (LIST.count("P_RENT")==1):
#         return [1,1,1,"111"]
#     elif (LIST.count("P_AUTO")==1) and (LIST.count("P_HOME")==1) and (LIST.count("P_RENT")==0):
#         return [1,1,0,"110"]
#     elif (LIST.count("P_AUTO")==1) and (LIST.count("P_HOME")==0) and (LIST.count("P_RENT")==1):
#         return [1,0,1,"101"]
#     elif (LIST.count("P_AUTO")==0) and (LIST.count("P_HOME")==1) and (LIST.count("P_RENT")==1):
#         return [0,1,1,"011"]
#     elif (LIST.count("P_AUTO")==1) and (LIST.count("P_HOME")==0) and (LIST.count("P_RENT")==0):
#         return [1,0,0,"100"]
#     elif (LIST.count("P_AUTO")==0) and (LIST.count("P_HOME")==1) and (LIST.count("P_RENT")==0):
#         return [0,1,0,"010"]
#     elif (LIST.count("P_AUTO")==0) and (LIST.count("P_HOME")==0) and (LIST.count("P_RENT")==1):
#         return [0,0,1,"001"]
#     elif (LIST.count("P_AUTO")==0) and (LIST.count("P_HOME")==0) and (LIST.count("P_RENT")==0):
#         return [0,0,0,"000"]

In [None]:
# # import numba as nb
# # @nb.jit(nopython=True)
# def Flatten_Product(Product_Edges):
#     Product_Edges.sort_values(by="src",inplace=True)
#     src, _, dst=np.array(Product_Edges).transpose()
#     row=np.unique(src).shape[0]
#     col=np.unique(dst).shape[0]
#     Flatten_df=np.zeros([row,col+2],dtype=int)   ### usaa_number,  P_AUTO,  P_HOME,  P_RENT,  Product_bit_Vector
    
#     uniq_v, uniq_e=np.unique(Product_Edges["src"],return_inverse=True)
#     _, idx=np.unique(uniq_e,return_index=True)

#     for i in tqdm(np.arange(len(uniq_v))):
        
#         if i!=len(uniq_v)-1:
#             Flatten_df[i][0]=list(set(Product_Edges[idx[i]:idx[i+1]].src.astype(int)))[0]
#             Flatten_df[i][1:5]=map_prod(list(set(Product_Edges[idx[i]:idx[i+1]].dst)))
#         else:
#             Flatten_df[i][0]=list(set(Product_Edges[idx[i]:].src.astype(int)))[0]
#             Flatten_df[i][1:5]=map_prod(list(set(Product_Edges[idx[i]:].dst)))
    
#     Flatten_df=pd.DataFrame(Flatten_df,columns=["USAA_Number","P_AUTO","P_HOME","P_RENT","Prod_Trace"])
#     Flatten_df["Prod_Trace"]=Flatten_df["Prod_Trace"].apply(lambda x : '{0:0>3}'.format(x)) ## add the leading zero in some prod_trace
    
#     return Flatten_df

In [None]:
# Flatten_df=Flatten_Product(Product_Edges)

In [None]:
# to_pickle(data_dir,Flatten_df,'Flatten_Product_pickle')
Flatten_Product_pickle=read_pickle(data_dir,'Flatten_Product_pickle')

In [None]:
tempt1=pd.DataFrame(Flatten_Product_pickle["Prod_Trace"].value_counts()).reset_index().rename(columns={'index':'product','Prod_Trace':'count'})
tempt2=pd.DataFrame(Flatten_Product_pickle["Prod_Trace"].value_counts(normalize=True)).reset_index().rename(columns={'index':'product','Prod_Trace':'percentage'})
tempt1.merge(tempt2, on="product", how="inner").style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
print("Pers_Edges Dataframe:")
print(Pers_Edges.head(2))
print()
print("Busi_Edges Dataframe:")
print(Busi_Edges.head(2))
print()
# print("zipcode_Edges Dataframe:")
# print(zipcode_Edges.head(2))
# print()
print("Product_Edges Dataframe:")
print(Flatten_Product_pickle.head(2))

#### Create bi-directional relationship

In [None]:
def datashow(dataframe):
    tempt1=pd.DataFrame(dataframe.rel.value_counts()).reset_index().rename(columns={'index':'rels','rel':'count'})
    tempt2=pd.DataFrame(dataframe.rel.value_counts(normalize=True)).reset_index().rename(columns={'index':'rels','rel':'percentage'})
    return tempt1.merge(tempt2, on="rels", how="inner").style.format({'count':'{:,}','percentage':'{:.2%}'})

def make_bidirectional(dataframe, rel_types, rev_rel_types):
    """Add a reversed copy of every edge and return the de-duplicated edge list.

    dataframe     : three-column triplet table (src, rel, dst)
    rel_types     : relationship names to rename on the reversed edges
    rev_rel_types : name of the reversed relationship, paired with ``rel_types``
                    by position

    Relationships not listed in ``rel_types`` keep their name on the reversed
    edge. The result has columns ['src','rel','dst'], is sorted by
    (src, rel, dst), and its index is not reset after duplicates are dropped.
    """
    src,rel,dst=np.asarray(dataframe).T

    # Masks are computed against the untouched ``rel`` so one renaming can never
    # feed into another.
    reversed_rel=rel.copy()
    for position, forward_name in enumerate(rel_types):
        reversed_rel[rel==forward_name]=rev_rel_types[position]

    all_src=np.concatenate((src,dst))
    all_dst=np.concatenate((dst,src))
    all_rel=np.concatenate((rel,reversed_rel))

    triplets=sorted(zip(all_src,all_rel,all_dst))
    return pd.DataFrame(triplets,columns=['src','rel','dst']).drop_duplicates()

In [None]:
datashow(Busi_Edges)

In [None]:
rel_types=Busi_Edges.rel.unique().tolist()  ## ['Busi_rel_Other', 'SPONSOR', 'AUTO_RELATED']
rev_rel_types=['Busi_rel_Other','SPONSEE','AUTO_RELATED']
for idx, val in enumerate(rel_types):
    print("{:<30}{:<30}".format(val, rev_rel_types[idx]))

In [None]:
%%time
rel_types=Busi_Edges.rel.unique().tolist()  ## ['Busi_rel_Other', 'SPONSOR', 'AUTO_RELATED']
rev_rel_types=['Busi_rel_Other','SPONSEE','AUTO_RELATED']
Busi_Edges_bi=make_bidirectional(Busi_Edges, rel_types, rev_rel_types)

In [None]:
datashow(Busi_Edges_bi)

In [None]:
# datashow(zipcode_Edges)

In [None]:
# zipcode_Edges['dst'].unique().shape[0]

In [None]:
# %%time
# rel_types=zipcode_Edges.rel.unique().tolist()  ## ['Located_In']
# rev_rel_types=["Location_of"]
# zipcode_Edges_bi=make_bidirectional(zipcode_Edges, rel_types, rev_rel_types)

In [None]:
# datashow(zipcode_Edges_bi)

In [None]:
Pers_Edges=Pers_Edges.replace(to_replace ="Step-Parent/Mother",value ="Step-Parent")
datashow(Pers_Edges)

In [None]:
rel_types=Pers_Edges.rel.unique().tolist()  
rev_rel_types=['Child','Spouse','Parent','Ex-Spouse','Pers_rel_Other','Brother_Sister','Step-Child','Step-Parent']
for idx, val in enumerate(rel_types):
    print("{:<30}{:<30}".format(val, rev_rel_types[idx]))

In [None]:
%%time
Pers_Edges_bi=make_bidirectional(Pers_Edges, rel_types, rev_rel_types)

In [None]:
datashow(Pers_Edges_bi)

#### Construct a Heterogeneous Graph

#### re-label source nodes and destination nodes for each node type

In [None]:
%%time
#### relabel the nodes of USAA Member so that they are consecutive integers starting from 0
src_pers, _ ,dst_pers=np.array(Pers_Edges_bi).transpose()
src_busi, _ ,dst_busi=np.array(Busi_Edges_bi).transpose()
# src_zipcode, _ ,dst_zipcode=np.array(zipcode_Edges_bi[zipcode_Edges_bi['rel']=="Located_In"]).transpose()

all_usaanr=np.concatenate((src_pers,dst_pers,src_busi,dst_busi))
uniq_usaanr = np.unique(all_usaanr)
# uniq_zipcode = np.unique(dst_zipcode)

vertices=vertices[vertices['usaanr'].isin(uniq_usaanr)]

uniq_usaanr.sort()
# uniq_zipcode.sort()
usaanr_map = {id:idx for idx, id in enumerate(uniq_usaanr)}
# zipcode_map = {id:idx for idx, id in enumerate(uniq_zipcode)}

In [None]:
Pers_Edges_bi['src'] = list(map(usaanr_map.get, Pers_Edges_bi['src']))
Pers_Edges_bi['dst'] = list(map(usaanr_map.get, Pers_Edges_bi['dst']))
print(len(usaanr_map))
print(Pers_Edges_bi['src'].unique().max())
print(Pers_Edges_bi['dst'].unique().max())

In [None]:
Busi_Edges_bi['src'] = list(map(usaanr_map.get, Busi_Edges_bi['src']))
Busi_Edges_bi['dst'] = list(map(usaanr_map.get, Busi_Edges_bi['dst']))
print(len(usaanr_map))
print(Busi_Edges_bi['src'].unique().max())
print(Busi_Edges_bi['dst'].unique().max())

In [None]:
# zipcode_Edges_v1=zipcode_Edges_bi[zipcode_Edges_bi['rel']=="Located_In"]
# zipcode_Edges_v1['src'] = list(map(usaanr_map.get, zipcode_Edges_v1['src']))
# zipcode_Edges_v1['dst'] = list(map(zipcode_map.get, zipcode_Edges_v1['dst']))

# zipcode_Edges_v2=zipcode_Edges_bi[zipcode_Edges_bi['rel']=="Location_of"]
# zipcode_Edges_v2['src'] = list(map(zipcode_map.get, zipcode_Edges_v2['src']))
# zipcode_Edges_v2['dst'] = list(map(usaanr_map.get, zipcode_Edges_v2['dst']))

# zipcode_Edges_bi=zipcode_Edges_v1.append(zipcode_Edges_v2,ignore_index=True)
# print(len(usaanr_map))
# print(zipcode_Edges_bi['src'].unique().max())
# print(len(zipcode_map))
# print(zipcode_Edges_bi['dst'].unique().max())

#### Re-label Edges types

In [None]:
# relation_encoder=LabelEncoder()
# relation_encoder.fit(pd.concat([Pers_Edges['rel'],Busi_Edges['rel'],zipcode_Edges['rel']]))

In [None]:
%%time
pers_rel_nums = {"rel":     {"Parent": 0, "Child": 1, "Spouse": 2,"Ex-Spouse": 3,"Brother_Sister": 4, "Step-Parent": 5, "Step-Child": 6, "Pers_rel_Other": 7}}
busi_rel_nums = {"rel":     {"SPONSOR": 8, "SPONSEE": 9, "AUTO_RELATED": 10,"Busi_rel_Other":11}}
# zipcode_rel_nums = {"rel":     {"Located_In": 12,"Location_of": 13}}

# start=time.time()
Pers_Edges_bi.replace(pers_rel_nums, inplace=True)
Busi_Edges_bi.replace(busi_rel_nums, inplace=True)
# zipcode_Edges_bi.replace(zipcode_rel_nums, inplace=True)

# end=time.time()
# print("running time is {:0.4f}".format(end-start))

In [None]:
print(Pers_Edges_bi['rel'].unique())
print(Busi_Edges_bi['rel'].unique())
# print(zipcode_Edges_bi['rel'].unique())

In [None]:
# Build the {(src_type, relation, dst_type): (src_ids, dst_ids)} mapping that
# dgl.heterograph consumes. Every relation connects 'usaanr' nodes to 'usaanr' nodes.
data_dict=dict()

### Personal Relationship ###
rel=np.array(Pers_Edges_bi['rel'])
src_pers=np.array(Pers_Edges_bi['src'])
dst_pers=np.array(Pers_Edges_bi['dst'])

# (relation name, integer code) pairs; the codes are the values the 'rel' column
# was encoded with (see pers_rel_nums).
_pers_relations=[('Parent',0),('Child',1),('Spouse',2),('Ex-Spouse',3),
                 ('Brother_Sister',4),('Step-Parent',5),('Step-Child',6),('Pers_rel_Other',7)]
for _name,_code in _pers_relations:
    _picked=(rel==_code)
    data_dict[('usaanr',_name,'usaanr')]=(src_pers[_picked], dst_pers[_picked])

### Business Relationship ###
rel=np.array(Busi_Edges_bi['rel'])
src_busi=np.array(Busi_Edges_bi['src'])
dst_busi=np.array(Busi_Edges_bi['dst'])

# Codes are the values used by busi_rel_nums.
_busi_relations=[('SPONSOR',8),('SPONSEE',9),('AUTO_RELATED',10),('Busi_rel_Other',11)]
for _name,_code in _busi_relations:
    _picked=(rel==_code)
    data_dict[('usaanr',_name,'usaanr')]=(src_busi[_picked], dst_busi[_picked])

# ### zipcode relationship ###
# zipcode_Edges_v1=zipcode_Edges_bi[zipcode_Edges_bi['rel']==12]
# src_v1=np.array(zipcode_Edges_v1['src'])
# dst_v1=np.array(zipcode_Edges_v1['dst'])

# zipcode_Edges_v2=zipcode_Edges_bi[zipcode_Edges_bi['rel']==13]
# src_v2=np.array(zipcode_Edges_v2['src'])
# dst_v2=np.array(zipcode_Edges_v2['dst'])

# data_dict.update({('usaanr', 'Located_In', 'zipcode')  :   (src_v1, dst_v1)})
# data_dict.update({('zipcode', 'Location_of','usaanr')  :   (src_v2, dst_v2)})

In [None]:
start=time.time()
g = dgl.heterograph(data_dict)
end=time.time()
print("running time is {:0.4f}".format(end-start))

#### Assign Edges Types

In [None]:
# Tag every edge with the integer code of its relation (0..11, in the order below).
# The codes are stored as float tensors, exactly as torch.ones(n)*code produces them.
# NOTE(review): cast to long downstream if these are used as embedding indices - confirm.
_etype_order=['Parent','Child','Spouse','Ex-Spouse','Brother_Sister','Step-Parent',
              'Step-Child','Pers_rel_Other','SPONSOR','SPONSEE','AUTO_RELATED','Busi_rel_Other']
for _code,_etype in enumerate(_etype_order):
    g.edges[_etype].data["etype"]=torch.ones(g.num_edges(_etype))*_code

In [None]:
def graph_show(G):
    """Print a plain-text summary of the heterogeneous graph ``G``.

    The report lists the node and edge types, the canonical (src, edge, dst)
    triples, per-type and total node/edge counts, and, for every node type that
    carries node data, the shape of each stored attribute.
    """
    rule='*'*50

    print(rule)
    print("Node_types: " , G.ntypes)
    print("Edge_types: " , G.etypes)
    print(rule)
    print("Canonical Etypes of Graph is:\n")
    for src_type, edge_type, dst_type in G.canonical_etypes:
        print("{:<20}{:<20}{:<20}".format(src_type, edge_type, dst_type))
    print(rule)

    # Per-type node counts followed by their total.
    node_total=0
    for ntype in G.ntypes:
        n_nodes=G.number_of_nodes(ntype)
        print(f"number of ntype={ntype:<20}  {n_nodes:<15,}")
        node_total=node_total+n_nodes
    print(rule)
    print("Total number of nodes is {:,}".format(node_total))
    print(rule)

    # Per-type edge counts followed by their total.
    edge_total=0
    for etype in G.etypes:
        n_edges=G.number_of_edges(etype)
        print(f"number of etype={etype:<20}  {n_edges:<15,}")
        edge_total=edge_total+n_edges
    print(rule)
    print("Total number of edges is {:,}".format(edge_total))
    print(rule)

    # Attribute shapes, only for node types that actually hold data.
    for ntype in G.ntypes:
        if not (G.nodes[ntype].data!={}):
            continue
        print(rule)
        print(f"The attributes for the node type={ntype}")
        print(rule)
        for attr_name, _ in G.node_attr_schemes(ntype=ntype).items():
            print("{:<40}{}".format(attr_name,G.nodes[ntype].data[attr_name].shape))

In [None]:
graph_show(g)

#### Adding features/attributes to the nodes of USAA Members
The CAP dataset has some features for USAA Member.

* usaayr:  USAA Number Issue Year
* AGE / AGE BUCKET
* ORIGEL :  Original Eligibility
* ELIG2 : Current Eligibility
* cmpyelig: Company Eligibility
* Segment: Alpha ~ Juliet
* SEX
* MARST : Marriage Status
* MILST : Military Status
* MLIST_OrigStat: Original Military Status
* ENLPAYGD: Military Pay Grade
* BRANCH: Military BRANCH of Service
* ACTCORP : Corporate Active Status
* STATE

We use label encoding for all categorical variables. <br>
In addition, there is a node attribute "type" that indicates whether a node is a USAA member or a zipcode in the heterogeneous graph. <br>
The zipcode nodes don't have the same features as the member nodes.

In [None]:
vertices.columns

In [None]:
# assert g.number_of_nodes(ntype="usaanr")==vertices.shape[0], "the shape of feature data is not equal to the number of USAA member"

#### sort the vertices dataframe based on the order of nodes in graph

In [None]:
if g.number_of_nodes(ntype="usaanr")==vertices.shape[0]:
    vertices_v2=vertices
else:
    vertices_v2=vertices[vertices['usaanr'].isin(all_usaanr)]
print("{:<20} {:<15,}".format("size of original vertices",vertices.shape[0]))
print("{:<20} {:<15,}".format("size of updated vertices",vertices_v2.shape[0]))

In [None]:
vertices_v2['usaanr'] = list(map(usaanr_map.get, vertices_v2['usaanr']))
vertices_v2.sort_values(by=["usaanr"],inplace=True)

to_pickle(data_dir,vertices_v2,'vertices_reindex_pickle')
vertices=read_pickle(data_dir,'vertices_reindex_pickle')

#### Create node type feature
The node type feature is created so that USAA Member, product and zipcode nodes can be embedded separately. Unlike the usaanr nodes, the zipcode nodes have no features other than the node type.

In [None]:
#### Nodes type feature
# g.nodes['usaanr'].data['type'] = torch.zeros(size=[g.number_of_nodes(ntype='usaanr'),1]).long()
# g.nodes['zipcode'].data['type'] = torch.ones(size=[g.number_of_nodes(ntype='zipcode'),1]).long()

In [None]:
#### Bin the numerical variable
def Bin_Numerical(args,b):
    """Return the index of the first bin of ``b`` that contains the value ``args``.

    ``b`` is a sequence of bin edges; bin ``i`` spans ``b[i]`` to ``b[i+1]`` with
    both ends inclusive, so a value sitting exactly on an interior edge is
    assigned to the lower bin. Returns ``None`` when no bin contains ``args``.
    """
    for bin_index, (lower, upper) in enumerate(zip(b, b[1:])):
        if args>=lower and args<=upper:
            return int(bin_index)
    return None

In [None]:
feat=['usaanr','usaayr','AGE_BAND','ORIGEL', 'ELIG2', 'cmpyelig','SEX', 'MARST','BRANCH','ENLPAYGD','MILST',
       'MLIST_OrigStat','ACTCORP', 'STATE', 'Segment']
vertices_feat=vertices.loc[:,feat]

for col in vertices_feat:
    if col !='usaanr':
        vertices_feat[col]=vertices_feat[col].astype('str')
    
class_le=LabelEncoder()

for col in vertices_feat.columns:
    if vertices_feat[col].dtype=="object" and col !='usaanr':
        vertices_feat[col]=vertices_feat[col].astype('str')
        vertices_feat[col]=class_le.fit_transform(vertices_feat[col])

In [None]:
for col in vertices_feat.columns:
    g.nodes['usaanr'].data[col]= torch.tensor( np.expand_dims(np.array(vertices_feat[col]), 1) )

#### Create label
There are two kinds of labels.

* multi-classification label:     Auto, Home and Rental
* binary-classification label:    Auto or Non-Auto

In [None]:
Flatten_Product_pickle.head(2)

In [None]:
prod_df=Flatten_Product_pickle[Flatten_Product_pickle["USAA_Number"].isin(all_usaanr)]
prod_df=prod_df.loc[:,["USAA_Number","P_AUTO","Prod_Trace"]].rename(columns={"USAA_Number":"usaanr"})
prod_df['usaanr'] = list(map(usaanr_map.get, prod_df['usaanr']))
prod_df['usaanr'].unique().max()  ### usaanr is not indexed consecutively

In [None]:
vertices=read_pickle(data_dir,'vertices_reindex_pickle')

In [None]:
vertices.usaanr.dtypes, prod_df.usaanr.dtypes

In [None]:
vertices["usaanr"]=vertices["usaanr"].astype(str)
prod_df["usaanr"]=prod_df["usaanr"].astype(str)
vertices_v1=vertices.merge(prod_df, on='usaanr', how="left").loc[:,['usaanr','P_AUTO','Prod_Trace']]
vertices_v1["Prod_Trace"]=vertices_v1["Prod_Trace"].astype(str)
vertices_v1["P_AUTO"]=vertices_v1["P_AUTO"].astype(str)                     

In [None]:
np.unique(vertices_v1["P_AUTO"],return_counts=True), np.unique(vertices_v1["Prod_Trace"],return_counts=True)

In [None]:
# 1 : customers own Auto,  0: customers owns some products but not Auto, 2: customers didn't own any product
rep_nums = {"P_AUTO":     {"1.0": "1", "0.0": "0", "nan": "2"}}  
vertices_v1.replace(rep_nums, inplace=True)

rep_nums = {"Prod_Trace":     {"nan": "000"}}  
vertices_v1.replace(rep_nums, inplace=True)

# vertices_v1.sort_values(by=["usaanr"],inplace=True)  ### will the sort change the order of node in graph 

In [None]:
tempt1=pd.DataFrame(vertices_v1["Prod_Trace"].value_counts()).reset_index().rename(columns={'index':'product','Prod_Trace':'count'})
tempt2=pd.DataFrame(vertices_v1["Prod_Trace"].value_counts(normalize=True)).reset_index().rename(columns={'index':'product','Prod_Trace':'percentage'})
tempt3=tempt1.merge(tempt2, on="product", how="inner")
tempt3["product_type"]=["No Product", "Auto+Home","Auto Only","Auto+Rental","Rental Only","Home Only","Auto+Home+Rental","Home+Rental"]
tempt3=tempt3[["product","product_type","count","percentage"]]
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
### due to some rare category,  merge "Auto + Home + Rental" to "Auto + Home", merge "Home + Rental" to "Home Only"
rep_nums = {"Prod_Trace":     {"111": "110",  "011":"010"}}  
vertices_v1.replace(rep_nums, inplace=True)
tempt1=pd.DataFrame(vertices_v1["Prod_Trace"].value_counts()).reset_index().rename(columns={'index':'product','Prod_Trace':'count'})
tempt2=pd.DataFrame(vertices_v1["Prod_Trace"].value_counts(normalize=True)).reset_index().rename(columns={'index':'product','Prod_Trace':'percentage'})
tempt1.merge(tempt2, on="product", how="inner").style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
Prod_Trace = vertices_v1["Prod_Trace"].unique()
Prod_Trace.sort()
product_map = {id:idx for idx, id in enumerate(Prod_Trace)}
product_map

In [None]:
vertices_v1['Prod_Trace_map'] = list(map(product_map.get, vertices_v1['Prod_Trace']))
multi_label=torch.tensor(np.expand_dims(np.array(vertices_v1.loc[:,"Prod_Trace_map"]),1))
torch.unique(multi_label.squeeze(),return_counts=True)

In [None]:
tempt1=pd.DataFrame(vertices_v1["P_AUTO"].value_counts()).reset_index().rename(columns={'index':'product','P_AUTO':'count'})
tempt2=pd.DataFrame(vertices_v1["P_AUTO"].value_counts(normalize=True)).reset_index().rename(columns={'index':'product','P_AUTO':'percentage'})
tempt3=tempt1.merge(tempt2, on="product", how="inner")
tempt3["product_type"]=["No Product", "Auto only","Other Products but not Auto"]
tempt3=tempt3[["product","product_type","count","percentage"]]
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
### combine the category of 0 and 2 because the category of 0 is too rare
rep_nums = {"P_AUTO":     {"2": "0"}}  
vertices_v1.replace(rep_nums, inplace=True)
tempt1=pd.DataFrame(vertices_v1["P_AUTO"].value_counts()).reset_index().rename(columns={'index':'product','P_AUTO':'count'})
tempt2=pd.DataFrame(vertices_v1["P_AUTO"].value_counts(normalize=True)).reset_index().rename(columns={'index':'product','P_AUTO':'percentage'})
tempt3=tempt1.merge(tempt2, on="product", how="inner")
tempt3["product_type"]=["No Auto","Auto Only"]
tempt3=tempt3[["product","product_type","count","percentage"]]
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
vertices_v1["P_AUTO"]=vertices_v1["P_AUTO"].astype(int)
binary_label=torch.tensor(np.expand_dims(np.array(vertices_v1.loc[:,"P_AUTO"]),1))
torch.unique(binary_label.squeeze(),return_counts=True)

#### Create training(80%), validation(10%) and test(10%) mask based on each category of product

In [None]:
def mask_func(train_idx,all_idx):
    """Return a boolean array flagging which entries of ``all_idx`` occur in ``train_idx``.

    The result has one element per entry of ``all_idx``, in the same order.
    """
    candidates=pd.Index(all_idx,name="idx")
    selected=pd.Index(train_idx,name="idx")
    return candidates.isin(selected)

def mask_creation(G,node_labels):
    """Build stratified train/validation/test masks over the 'usaanr' nodes of ``G``.

    For every distinct label value, the positions of the nodes carrying that
    label are shuffled with a fixed seed (101); the first ``len//10`` go to the
    test set, the next slice up to ``len//5`` to the validation set, and the
    remainder to the training set (roughly 10% / 10% / 80% per class).

    Parameters
    ----------
    G : graph exposing ``number_of_nodes(ntype="usaanr")``
    node_labels : torch tensor holding one label per 'usaanr' node; row ``i``
        must belong to node ``i``

    Returns
    -------
    (train_mask, val_mask, test_mask) : boolean torch tensors with one entry per
        'usaanr' node

    Raises
    ------
    ValueError
        If the number of labels differs from the number of 'usaanr' nodes, since
        the masks would then not line up with the node order.
    """
    # reshape(-1) instead of squeeze() so a single-node label tensor stays 1-D.
    labels=node_labels.numpy().reshape(-1)
    num_nodes=G.number_of_nodes(ntype="usaanr")
    if labels.shape[0]!=num_nodes:
        raise ValueError(
            "node_labels has {} entries but the graph has {} 'usaanr' nodes".format(
                labels.shape[0], num_nodes))

    node_ids=np.arange(num_nodes)
    train_mask=np.zeros(num_nodes,dtype=bool)
    val_mask=np.zeros(num_nodes,dtype=bool)
    test_mask=np.zeros(num_nodes,dtype=bool)

    for label in tqdm(np.unique(labels).tolist(),position=0,leave=True):
        members=node_ids[labels==label]
        # A private generator seeded with 101 yields the same permutation as
        # np.random.seed(101) + np.random.shuffle, without resetting the global
        # NumPy random state for the rest of the notebook.
        np.random.RandomState(101).shuffle(members)
        tenth=len(members)//10
        fifth=len(members)//5
        test_mask[members[:tenth]]=True
        val_mask[members[tenth:fifth]]=True
        train_mask[members[fifth:]]=True

    return (torch.tensor(train_mask,dtype=torch.bool),
            torch.tensor(val_mask,dtype=torch.bool),
            torch.tensor(test_mask,dtype=torch.bool))

In [None]:
train_mask_multi_label, val_mask_multi_label, test_mask_multi_label=mask_creation(g,multi_label)
print("{:<30}{:<10,}".format("dimension of training mask", torch.sum(train_mask_multi_label).item()))
print("{:<30}{:<10,}".format("dimension of val mask", torch.sum(val_mask_multi_label).item()))
print("{:<30}{:<10,}".format("dimension of test mask", torch.sum(test_mask_multi_label).item()))

In [None]:
train_mask_binary_label, val_mask_binary_label, test_mask_binary_label=mask_creation(g,binary_label)
print("{:<30}{:<10,}".format("dimension of training mask", torch.sum(train_mask_binary_label).item()))
print("{:<30}{:<10,}".format("dimension of val mask", torch.sum(val_mask_binary_label).item()))
print("{:<30}{:<10,}".format("dimension of test mask", torch.sum(test_mask_binary_label).item()))

#### Training Set

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(train_mask_binary_label.numpy())]['P_AUTO'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'count'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(train_mask_binary_label.numpy())]['P_AUTO'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'percentage'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt3=tempt1.merge(tempt2, on="Auto or Not", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(train_mask_multi_label.numpy())]['Prod_Trace'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'count'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(train_mask_multi_label.numpy())]['Prod_Trace'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'percentage'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt3=tempt1.merge(tempt2, on="Product_Type", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

#### Validation Set

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(val_mask_binary_label.numpy())]['P_AUTO'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'count'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(val_mask_binary_label.numpy())]['P_AUTO'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'percentage'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt3=tempt1.merge(tempt2, on="Auto or Not", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(val_mask_multi_label.numpy())]['Prod_Trace'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'count'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(val_mask_multi_label.numpy())]['Prod_Trace'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'percentage'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt3=tempt1.merge(tempt2, on="Product_Type", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

#### Test Set

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(test_mask_binary_label.numpy())]['P_AUTO'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'count'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(test_mask_binary_label.numpy())]['P_AUTO'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Auto or Not','P_AUTO':'percentage'}).replace({'Auto or Not':     {0: "No",1:"Yes"}})
tempt3=tempt1.merge(tempt2, on="Auto or Not", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
tempt1=pd.DataFrame(vertices_v1.iloc[np.where(test_mask_multi_label.numpy())]['Prod_Trace'].value_counts(dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'count'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt2=pd.DataFrame(vertices_v1.iloc[np.where(test_mask_multi_label.numpy())]['Prod_Trace'].value_counts(normalize=True,dropna=False)).reset_index().\
rename(columns={'index':'Product_Type','Prod_Trace':'percentage'}).replace({'Product_Type':     {'000': "No Product", '001': "Rental Only", '010': "Home Only", '100': "Auto Only", '101': "Auto + Rental", '110': "Auto + Home"}})
tempt3=tempt1.merge(tempt2, on="Product_Type", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

#### Save Graph

In [None]:
data_dir="/workspace/cjiang/eagle_project/CAP_graph/BGNN/"
start=time.time()
with open(os.path.join(data_dir,"CAP_Graph_v1"),"wb") as f:
    pickle.dump((g,multi_label,binary_label,\
                 train_mask_multi_label,  val_mask_multi_label,  test_mask_multi_label,\
                 train_mask_binary_label, val_mask_binary_label, test_mask_binary_label),f)
end=time.time()
print("It took {:0.4f} seconds to save graph database".format(end-start))

In [None]:
start=time.time()
with open(os.path.join(data_dir,"CAP_Graph_v1"),"rb") as f:
    G,multi_label,binary_label,\
    train_mask_multi_label,  val_mask_multi_label,  test_mask_multi_label,\
    train_mask_binary_label, val_mask_binary_label, test_mask_binary_label=pickle.load(f)
end=time.time()
print("It took {:0.4f} seconds to load graph database".format(end-start))

In [None]:
graph_show(G)

#### count the number of nodes whose in-degree ==0

In [None]:
# C=torch.empty(G.num_nodes('usaanr'))
# for etype in G.etypes:
#     C+=G.in_degrees(etype=etype)
# print("{:<35}{:<10,} ".format("The number of zero in-degree nodes is ",torch.sum(C==0)))

#### Find metapaths in the generated heterogeneous graph

In [None]:
# metagraph is a method in the DGL API used by this notebook (num_edges, heterograph
# from tensor pairs); without the call the cell only displays the bound method.
G.metagraph()

In [None]:
import itertools

def get_all_possible_metapaths(g, K):
    """Return every length-``K`` metapath of ``g`` that is realised by at least one path.

    A candidate is any ``K``-long sequence of canonical edge types
    ``(srctype, etype, dsttype)``. It is kept when

    1. consecutive edge types chain, i.e. the destination node type of each
       edge type equals the source node type of the next one, and
    2. ``dgl.metapath_reachable_graph`` built from it has at least one edge.

    Returns a list of tuples of canonical edge types, in the order produced by
    ``itertools.product(g.canonical_etypes, repeat=K)``.
    """
    filtered_metapaths = []
    # Iterate over all possible K-length sequences of all canonical edge types
    for metapath in itertools.product(g.canonical_etypes, repeat=K):
        # The sequence is a valid metapath only if each hop starts on the node
        # type where the previous hop ended.
        chains = all(
            previous[2] == following[0]
            for previous, following in zip(metapath, metapath[1:])
        )
        if not chains:
            continue
        # Drop metapaths that no actual path in the graph follows.
        result_g = dgl.metapath_reachable_graph(g, metapath)
        if result_g.number_of_edges() > 0:
            filtered_metapaths.append(metapath)
    return filtered_metapaths

def pretty_print_metapath(metapath):
    """Render a metapath as ``srctype -- (etype) -- dsttype -- (etype) -- ...``.

    ``metapath`` is a non-empty sequence of canonical edge types
    ``(srctype, etype, dsttype)``; the string starts with the source node type
    of the first hop and then alternates edge type and destination node type.
    """
    pieces = [metapath[0][0]]
    for hop in metapath:
        pieces.append('(' + hop[1] + ')')
        pieces.append(hop[2])
    return ' -- '.join(pieces)

In [None]:
for K in range(1, 3):
    print('### Length', K, 'metapaths ###')
    possible_metapaths = get_all_possible_metapaths(G, K)
    # print the metapaths
    for metapath in possible_metapaths:
        print(pretty_print_metapath(metapath))

In [None]:
print("Total number of metapath(length 2) is {}".format(len(possible_metapaths)))