In [1]:
ls

Dockerfile        fd_sl_train_entry_point.py  [0m[01;34mmodel-dir[0m/          [01;35mroc_curve.png[0m
__init__.py       fraud_tabular.ipynb         [01;34moutput-dir[0m/         utils.py
[01;34m__pycache__[0m/      graph_data_preprocessor.py  [01;35mpr_curve.png[0m
data.py           graph_utils.py              [01;34mpreprocessed_data[0m/
estimator_fns.py  [01;34minput_data[0m/                 pytorch_model.py


In [2]:
import argparse
import json
import os
import pickle
import sys
import time

os.environ['DGLBACKEND'] = 'pytorch'

import torch as th
import dgl
print('DLG version: {}'.format(dgl.__version__))
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix
from estimator_fns import parse_args, get_logger
from graph_utils import get_files, construct_graph
from data import get_features, get_labels, read_masked_nodes, parse_edgelist, read_edges
from utils import get_metrics
from pytorch_model import HeteroRGCN

Using backend: pytorch


DLG version: 0.7.2


In [3]:
# Load the fraud-detection challenge transactions from the hosted parquet file.
# NOTE(review): this fetches over the network on every fresh run; consider caching a local copy.
data = pd.read_parquet('https://hr-projects-assets-prod.s3.amazonaws.com/g2m06b3p570/e25b2de8258d6972dd6aa7e5106aa8a9/fraud_detection_challenge_data.parquet')

In [6]:
# Check which distinct label values occur in the data.
data['label'].unique()

array([0, 1])

## Format data: build features, labels and edge lists

In [17]:
from itertools import combinations
import logging

In [8]:
# List the available columns (ids, payment type, label, date and f0..f31).
data.columns

Index(['tid', 'sid', 'mid', 'payment_type', 'label', 'date', 'f0', 'f1', 'f2',
       'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13',
       'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23',
       'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31'],
      dtype='object')

In [15]:
# Day offset of every transaction relative to the earliest date in the data.
# Note: this adds a column to `data` in place; the preprocessing cell drops 'DT' again
# before building features.
data['DT'] = (data['date'] - data['date'].min()).dt.days

In [34]:
# Column roles used by the preprocessing functions below.
TRANSACTION_ID_COL = 'tid'          # transaction identifier column
LABEL_COL = 'label'                 # label column (values 0 / 1 in this data)
DATE_COL = 'date'                   # date column, excluded from the features
ID_COLS = ['sid','mid']             # id columns; each one becomes a relation / edge type
CATEGORICAL_COLS = ['payment_type'] # columns that get one-hot encoded

In [19]:
# Shared dict that the preprocessing functions write their results into
# ('features', 'labels', 'relation_<col>_edgelist', optionally 'homogeneous_edges').
processed_data = {}

In [35]:
def parse_args(argv=None):
    """Parse the command-line options of the preprocessing step.

    NOTE: this notebook defines a second ``parse_args`` (training options) further
    down; once that cell has run, it replaces this definition.

    :param argv: optional list of argument strings. ``None`` (the default) makes
        argparse read ``sys.argv[1:]``; pass ``[]`` inside a notebook to get the
        defaults without tripping over the kernel's own command-line flags.
    :return: argparse.Namespace with the parsed options
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default='input_data')
    parser.add_argument('--output-dir', type=str, default='preprocessed_data')
    parser.add_argument('--transactions', type=str, default='transaction.csv', help='name of file with transactions')
    parser.add_argument('--identity', type=str, default='identity.csv', help='name of file with identity info')
    # The defaults previously referenced undefined names (args_ids / cat_cols);
    # derive them from the column constants configured at the top of the notebook.
    parser.add_argument('--id-cols', type=str, default=','.join(ID_COLS),
                        help='comma separated id cols in transactions table')
    parser.add_argument('--cat-cols', type=str, default=','.join(CATEGORICAL_COLS),
                        help='comma separated categorical cols in transactions')
    parser.add_argument('--train-data-ratio', type=float, default=0.8, help='fraction of data to use in training set')
    parser.add_argument('--construct-homogeneous', action="store_true", default=False,
                        help='use bipartite graphs edgelists to construct homogenous graph edgelist')
    return parser.parse_args(argv)


def get_logger(name):
    """Configure root logging at INFO level and return a named logger.

    :param name: logger name (usually ``__name__``)
    :return: logging.Logger set to INFO
    """
    # Import locally: cells in this notebook rebind the global name `logging` to a
    # Logger instance, which has no `getLogger` / `basicConfig` and would make a
    # second call to this function fail.
    import logging as _logging

    logger = _logging.getLogger(name)
    log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s'
    _logging.basicConfig(format=log_format, level=_logging.INFO)
    logger.setLevel(_logging.INFO)
    return logger


def load_data(transaction_data: pd.DataFrame, train_data_ratio: float):
    """Log basic statistics and split the transactions by row position.

    The first ``int(n_rows * train_data_ratio)`` rows are treated as training data,
    the remaining rows as test data. The frame itself is returned unchanged.

    :param transaction_data: transactions containing TRANSACTION_ID_COL and LABEL_COL
    :param train_data_ratio: fraction of rows (from the top) used for training
    :return: (pd.DataFrame, np.ndarray) the input frame and the transaction ids of the test rows
    """
    transaction_df = transaction_data
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    logging.info("# Tagged transactions: {}".format(len(transaction_df) - transaction_df[LABEL_COL].isnull().sum()))

    # extract out transactions for test/validation
    n_train = int(transaction_df.shape[0] * train_data_ratio)
    test_ids = transaction_df[TRANSACTION_ID_COL].values[n_train:]

    def fraud_percent(series):
        # An empty split (e.g. train_data_ratio of 0 or 1) has no fraud rate;
        # report NaN instead of raising ZeroDivisionError.
        if len(series) == 0:
            return float('nan')
        return 100 * sum(series) / len(series)

    label_column = transaction_df[LABEL_COL]
    logging.info("Percent fraud for train transactions: {}".format(fraud_percent(label_column[:n_train])))
    logging.info("Percent fraud for test transactions: {}".format(fraud_percent(label_column[n_train:])))
    logging.info("Percent fraud for all transactions: {}".format(fraud_percent(label_column)))

    return transaction_df, test_ids


def get_features_and_labels(transactions_df: pd.DataFrame, transactions_id_cols: list, transactions_cat_cols: list):
    """Build the feature table and the label table and store both in ``processed_data``.

    Every column except the label, the date and the given id columns is kept as a
    feature (the transaction id column therefore stays in the table). Categorical
    columns are one-hot encoded and missing values are filled with 0.

    :param transactions_df: transaction table
    :param transactions_id_cols: id columns to leave out of the features
    :param transactions_cat_cols: columns to one-hot encode
    :return: None; results are written to processed_data['features'] and processed_data['labels']
    """
    # Feature table
    excluded = [LABEL_COL, DATE_COL] + transactions_id_cols
    feature_cols = [name for name in transactions_df.columns if name not in excluded]
    logging.info("Categorical columns: {}".format(transactions_cat_cols))

    encoded = pd.get_dummies(transactions_df[feature_cols], columns=transactions_cat_cols)
    features = encoded.fillna(0)
    logging.info("Transformed feature columns: {}".format(list(features.columns)))
    logging.info("Shape of features: {}".format(features.shape))
    processed_data['features'] = features

    # Label table: transaction id next to its label
    label_frame = transactions_df[[TRANSACTION_ID_COL, LABEL_COL]]
    processed_data['labels'] = label_frame.copy()

def get_relations_and_edgelist(transactions_df: pd.DataFrame, transactions_id_cols: list):
    """Create one (transaction id, id value) edge table per id column.

    Rows with a missing value in either column are dropped. Each table is also
    stored in ``processed_data`` under 'relation_<column>_edgelist'.

    :param transactions_df: transaction table
    :param transactions_id_cols: id columns; each one is treated as a relation type
    :return: dict mapping id column name -> two-column edge DataFrame
    """
    edge_types = transactions_id_cols
    logging.info("Found the following distinct relation types: {}".format(edge_types))

    full_identity_df = transactions_df[[TRANSACTION_ID_COL] + transactions_id_cols]
    logging.info("Shape of identity columns: {}".format(full_identity_df.shape))

    # One bipartite edge table per relation type
    edges = {
        etype: full_identity_df[[TRANSACTION_ID_COL, etype]].dropna()
        for etype in edge_types
    }
    for etype, edgelist in edges.items():
        processed_data['relation_{}_edgelist'.format(etype)] = edgelist
    return edges


def create_homogeneous_edgelist(edges):
    """Connect transactions that share an id value and store the pairs in ``processed_data``.

    For every relation table, transactions with the same id value are linked
    pairwise. A pair already produced by an earlier group (in either direction)
    is not added again.

    :param edges: dict mapping relation name -> edge DataFrame with
        TRANSACTION_ID_COL and the relation column
    :return: None; the list of (tid, tid) tuples is written to processed_data['homogeneous_edges']
    """
    homogeneous_edges = []
    # Set of pairs emitted by earlier groups: O(1) membership tests instead of
    # scanning the ever-growing result list for every candidate pair.
    seen = set()
    for etype, relations in edges.items():
        for _, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame[TRANSACTION_ID_COL].values, 2)
                         if (a, b) not in seen and (b, a) not in seen]
            homogeneous_edges.extend(new_edges)
            seen.update(new_edges)

    processed_data['homogeneous_edges'] = homogeneous_edges

In [None]:
# Configure logging. The logger is bound to its own name: rebinding `logging` would
# shadow the standard-library module, and any later get_logger(...) call would then
# fail because a Logger has no `getLogger`. The functions above keep logging through
# the `logging` module, which get_logger configures at INFO level.
logger = get_logger(__name__)

In [36]:
# Run the preprocessing pipeline on the in-memory transactions.
construct_homogeneous = False

# Work on a copy without the helper 'DT' column added earlier.
transactions = data.drop(columns= ['DT']).copy()
# NOTE(review): load_data splits by row position (first 80% of rows = train, rest = test).
# The fraud rates it logs (about 2.3% for train vs 63% for test) suggest the row order
# makes this split unrepresentative — confirm this is the intended split.
transactions, test_transactions = load_data(transactions,
                                            0.8)
# Both calls below write their results into the shared `processed_data` dict.
get_features_and_labels(transactions, ID_COLS, CATEGORICAL_COLS)
relational_edges = get_relations_and_edgelist(transactions, ID_COLS)

# Optional: also link transactions that share an id value directly.
if construct_homogeneous:
    create_homogeneous_edgelist(relational_edges)

2022-10-30 13:23:32,315 INFO __main__: Shape of transaction data is (45954, 38)
2022-10-30 13:23:32,317 INFO __main__: # Tagged transactions: 45954
2022-10-30 13:23:32,321 INFO __main__: Percent fraud for train transactions: 2.3365884177025813
2022-10-30 13:23:32,323 INFO __main__: Percent fraud for test transactions: 63.3010553802633
2022-10-30 13:23:32,326 INFO __main__: Percent fraud for all transactions: 14.52974713844279
2022-10-30 13:23:32,327 INFO __main__: Categorical columns: ['payment_type']
2022-10-30 13:23:32,351 INFO __main__: Transformed feature columns: ['tid', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'payment_type_a', 'payment_type_b', 'payment_type_c', 'payment_type_d', 'payment_type_e']
2022-10-30 13:23:32,352 INFO __main__: Shape of features: (45954, 38)
2022-10-30 13:23:32,356 INFO __main__: Foun

In [37]:
# Inspect the feature table (first column is the transaction id, then the features).
processed_data['features']

Unnamed: 0,tid,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f27,f28,f29,f30,f31,payment_type_a,payment_type_b,payment_type_c,payment_type_d,payment_type_e
0,0,0.022376,0.070495,0.428682,0.999985,0.999985,0.398457,0.823592,0.497025,0.965457,...,0.009950,0.014925,0.592040,0.139303,0.497512,0,0,0,0,1
1,1,0.024928,0.999985,0.999985,0.999985,0.999985,0.398457,0.999985,0.941257,0.217850,...,0.009950,0.014925,0.592040,0.139303,0.497512,0,0,1,0,0
2,2,0.006173,0.070495,0.428682,0.999985,0.999985,0.398457,0.057230,0.151243,0.841056,...,0.009950,0.014925,0.592040,0.139303,0.497512,0,0,0,0,1
3,3,0.017375,0.070495,0.428682,0.999985,0.999985,0.398457,0.999985,0.930203,0.025402,...,0.009950,0.014925,0.592040,0.139303,0.497512,1,0,0,0,0
4,4,0.009081,0.999985,0.999985,0.999985,0.999985,0.398457,0.159374,0.697188,0.195489,...,0.009950,0.014925,0.592040,0.139303,0.497512,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45949,45949,0.009081,0.695066,0.428682,0.999985,0.999985,0.999985,0.014230,0.119490,0.348379,...,0.457711,0.268657,0.368159,0.303483,0.845771,0,0,0,0,1
45950,45950,0.003175,0.573930,0.999985,0.999985,0.022791,0.398457,0.999985,0.500364,0.223533,...,0.601990,0.402985,0.482587,0.800995,0.164179,0,0,0,0,1
45951,45951,0.009081,0.350026,0.428682,0.999985,0.999985,0.398457,0.999985,0.621975,0.122220,...,0.601990,0.402985,0.482587,0.800995,0.164179,0,0,0,1,0
45952,45952,0.006173,0.070495,0.999985,0.999985,0.999985,0.999985,0.096758,0.298064,0.530217,...,0.781095,0.855721,0.442786,0.447761,0.587065,0,0,0,0,1


## Train model

In [4]:
def normalize(feature_matrix):
    """Standardize every feature column to zero mean and unit (population) std.

    Columns with zero variance are centered but not scaled: their standard
    deviation is reported as 1 so the division cannot produce NaN/inf, and the
    returned statistics can be reused later without dividing by zero.

    :param feature_matrix: 2-D float tensor, one row per node
    :return: (mean, stdev, normalized_features); mean and stdev are per column
    """
    mean = th.mean(feature_matrix, dim=0)
    stdev = th.sqrt(th.sum((feature_matrix - mean) ** 2, dim=0) / feature_matrix.shape[0])
    # Guard constant columns against division by zero.
    stdev = th.where(stdev > 0, stdev, th.ones_like(stdev))
    return mean, stdev, (feature_matrix - mean) / stdev


def train_fg(model, optim, loss, features, labels, train_g, test_g, test_mask,
             device, n_epochs, thresh, compute_metrics=True):
    """
    A full graph version of RGCN training.

    Each epoch runs one forward/backward pass over the whole graph; the loss is
    computed only on nodes where ``test_mask`` is 0.

    NOTE(review): the per-epoch f1 printed below comes from ``evaluate`` on ALL
    target nodes (train and test labels together), so it is not a held-out score.

    :param model: model called as ``model(graph, features)``
    :param optim: optimizer over the model parameters
    :param loss: loss called as ``loss(predictions, labels)``
    :param features: node feature tensor
    :param labels: label tensor, one entry per target node
    :param train_g: graph used for training
    :param test_g: graph used for the final predictions
    :param test_mask: tensor that is non-zero for test nodes
    :param device: torch device to run on
    :param n_epochs: number of training epochs
    :param thresh: probability threshold for the final class predictions (falsy -> argmax)
    :param compute_metrics: whether to compute and print evaluation metrics after training
    :return: (model, class_preds, pred_proba)
    """
    train_mask = th.logical_not(test_mask)
    train_idx = th.nonzero(train_mask, as_tuple=True)[0]

    print("Train label: {}".format(train_mask.sum()))
    print("Test label: {}".format(test_mask.sum()))

    # Move the features once instead of on every epoch.
    features = features.to(device)

    duration = []
    for epoch in range(n_epochs):
        tic = time.time()

        pred = model(train_g, features)

        l = loss(th.index_select(pred, 0, train_idx),
                 th.index_select(labels, 0, train_idx))

        optim.zero_grad()
        l.backward()
        optim.step()

        # .item() yields a plain float so the autograd graph is not kept alive.
        loss_val = l.item()

        duration.append(time.time() - tic)
        metric = evaluate(model, train_g, features, labels, device)
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | f1 {:.4f} ".format(
                epoch, np.mean(duration), loss_val, metric))

    class_preds, pred_proba = get_model_class_predictions(model,
                                                          test_g,
                                                          features,
                                                          labels,
                                                          device,
                                                          threshold=thresh)

    if compute_metrics:
        # .cpu() is a no-op on CPU tensors and makes the numpy conversion work on GPU.
        acc, f1, p, r, roc, pr, ap, cm = get_metrics(class_preds, pred_proba,
                                                     labels.cpu().numpy(), test_mask.cpu().numpy(), './')
        print("Metrics")
        print("""Confusion Matrix:
                                {}
                                f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, acc: {:.4f}, roc: {:.4f}, pr: {:.4f}, ap: {:.4f}
                             """.format(cm, f1, p, r, acc, roc, pr, ap))

    return model, class_preds, pred_proba


def get_f1_score(y_true, y_pred):
    """
    Precision, recall and F1 of the positive class. Only works for the binary case.

    The confusion matrix is always built for labels [0, 1], so it is 2x2 even
    when only one class occurs in the inputs:
    tn, fp, fn, tp = cf_m[0,0], cf_m[0,1], cf_m[1,0], cf_m[1,1]

    :param y_true: A list of labels in 0 or 1: 1 * N
    :param y_pred: A list of labels in 0 or 1: 1 * N
    :return: (precision, recall, f1)
    """
    # Smoothing term kept from the original formulas (10e-5 == 1e-4).
    eps = 10e-5

    # labels=[0, 1] pins the matrix shape; without it a single-class input
    # yields a 1x1 matrix and the [1, 1] lookups below raise IndexError.
    cf_m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tp, fp, fn = cf_m[1, 1], cf_m[0, 1], cf_m[1, 0]

    precision = tp / (tp + fp + eps)
    # No positive samples at all: report 0 recall instead of dividing by zero.
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall + eps)

    return precision, recall, f1


def evaluate(model, g, features, labels, device):
    """Compute the F1 value in a binary classification case.

    :param model: model called as ``model(graph, features)``
    :param g: graph to run the model on
    :param features: node feature tensor
    :param labels: true labels (tensor or array-like), one per predicted node
    :param device: torch device the features are moved to
    :return: f1 score of the positive class
    """
    # Inference only: skip building the autograd graph.
    with th.no_grad():
        logits = model(g, features.to(device))
    # Bring everything back to CPU numpy so this also works when running on GPU.
    preds = th.argmax(logits, dim=1).detach().cpu().numpy()
    y_true = labels.detach().cpu().numpy() if th.is_tensor(labels) else labels
    _, _, f1 = get_f1_score(y_true, preds)

    return f1


def get_model_class_predictions(model, g, features, labels, device, threshold=None):
    """Predict a class and the positive-class probability for every node.

    :param model: model called as ``model(graph, features)``; returns per-class scores
    :param g: graph to run the model on
    :param features: node feature tensor
    :param labels: unused; kept for interface compatibility
    :param device: torch device the features are moved to
    :param threshold: if falsy (None or 0) the class is the argmax of the scores;
        otherwise a node is class 1 when its class-1 probability exceeds the threshold
    :return: (class_preds, pred_proba) two 1-D numpy arrays with one entry per node
    """
    with th.no_grad():
        unnormalized_preds = model(g, features.to(device))
        pred_proba = th.softmax(unnormalized_preds, dim=-1)

    positive_proba = pred_proba[:, 1].detach().cpu().numpy()
    if not threshold:
        return unnormalized_preds.argmax(dim=1).detach().cpu().numpy(), positive_proba
    # Threshold only the positive-class column; thresholding the whole (N, 2)
    # probability matrix would return one "prediction" per class instead of per node.
    return np.where(positive_proba > threshold, 1, 0), positive_proba


def save_model(g, model, model_dir, id_to_node, mean, stdev):
    """Persist the trained model, graph metadata and non-target node embeddings.

    Writes into ``model_dir``:
      * model.pth        - the model's state dict
      * metadata.pkl     - canonical edge types, node counts per type, feature mean/std
      * <ntype>.csv      - one file per non-target node type with ids and embeddings

    :param g: graph; its canonical_etypes, ntypes and node counts are read
    :param model: trained model exposing ``state_dict()`` and ``embed[ntype]``
    :param model_dir: existing directory to write into
    :param id_to_node: dict node type -> {original id: node index}
    :param mean: feature mean stored in the metadata
    :param stdev: feature standard deviation stored in the metadata
    :return: None
    """

    # Save Pytorch model's parameters to model.pth
    th.save(model.state_dict(), os.path.join(model_dir, 'model.pth'))

    # Save graph's structure information to metadata.pkl for inference codes to initialize RGCN model.
    etype_list = g.canonical_etypes
    ntype_cnt = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
    with open(os.path.join(model_dir, 'metadata.pkl'), 'wb') as f:
        pickle.dump({'etypes': etype_list,
                     'ntype_cnt': ntype_cnt,
                     'feat_mean': mean,
                     'feat_std': stdev}, f)

    # Save original IDs to Node_ids, and trained embedding for non-target node type
    # Convert id_to_node into pandas dataframes
    for ntype, mapping in id_to_node.items():

        # ignore target node
        if ntype == 'target':
            continue

        # retrieve old and node id list
        old_id_list, node_id_list = [], []
        for old_id, node_id in mapping.items():
            old_id_list.append(old_id)
            node_id_list.append(node_id)

        # retrieve embeddings of a node type
        # NOTE(review): .numpy() needs a CPU tensor — confirm the embeddings are on CPU
        # when the model was trained on a GPU.
        node_feats = model.embed[ntype].detach().numpy()

        # get the number of nodes and the dimension of features
        num_nodes = node_feats.shape[0]
        num_feats = node_feats.shape[1]

        # create id dataframe
        # (assumes the mapping has exactly one entry per embedding row — TODO confirm)
        node_ids_df = pd.DataFrame({'~label': [ntype] * num_nodes})
        node_ids_df['~id_tmp'] = old_id_list
        node_ids_df['~id'] = node_ids_df['~id_tmp'].apply(lambda col: f'{ntype}-{col}')
        node_ids_df['node_id'] = node_id_list

        # create feature dataframe columns: one 'val<i>' column per embedding dimension,
        # then serialize every row to a JSON string
        cols = {'val' + str(i + 1): node_feats[:, i] for i in range(num_feats)}
        node_feats_df = pd.DataFrame(cols)
        json_props_df = node_feats_df.apply(lambda row: json.dumps(dict(row), default=str), axis=1).to_frame('props_values:String') 

        # merge id with feature, where feature_df use index
        node_id_feats_df = node_ids_df.merge(json_props_df, left_on='node_id', right_on=json_props_df.index)
        # drop the id_tmp and node_id columns to follow the Gremlin format requirements
        node_id_feats_df = node_id_feats_df.drop(['~id_tmp', 'node_id'], axis=1)

        # dump the embeddings to files
        node_id_feats_df.to_csv(os.path.join(model_dir, ntype + '.csv'),
                                index=False, header=True, encoding='utf-8')


def get_model(ntype_dict, etypes, hyperparams, in_feats, n_classes, device):
    """Instantiate a HeteroRGCN and move it to the requested device.

    :param ntype_dict: dict node type -> number of nodes
    :param etypes: edge types of the graph
    :param hyperparams: dict providing 'n_hidden' and 'n_layers'
    :param in_feats: input feature size (also passed as the last constructor argument)
    :param n_classes: number of output classes
    :param device: torch device
    :return: the model on ``device``
    """
    n_hidden = hyperparams['n_hidden']
    n_layers = hyperparams['n_layers']
    return HeteroRGCN(ntype_dict, etypes, in_feats, n_hidden, n_classes, n_layers, in_feats).to(device)

In [None]:
def parse_args(argv=None):
    """Parse the command-line options of the training step.

    NOTE: this is the second ``parse_args`` in the notebook; it replaces the
    preprocessing variant defined earlier once this cell has run.

    :param argv: optional list of argument strings. ``None`` (the default) makes
        argparse read ``sys.argv[1:]``; pass ``[]`` inside a notebook to get the
        defaults without tripping over the kernel's own command-line flags.
    :return: argparse.Namespace with the parsed options
    """
    parser = argparse.ArgumentParser()

    # Locations and file names
    parser.add_argument('--training-dir', type=str, default= 'preprocessed_data')
    parser.add_argument('--model-dir', type=str, default= 'model-dir')
    parser.add_argument('--output-dir', type=str, default= 'output-dir')
    parser.add_argument('--nodes', type=str, default='features.csv')
    parser.add_argument('--target-ntype', type=str, default='TransactionID')
    parser.add_argument('--edges', type=str, default='relation*')
    parser.add_argument('--labels', type=str, default='tags.csv')
    parser.add_argument('--new-accounts', type=str, default='test.csv')
    # Evaluation
    parser.add_argument('--compute-metrics', type=lambda x: (str(x).lower() in ['true', '1', 'yes']),
                        default=True, help='compute evaluation metrics after training')
    parser.add_argument('--threshold', type=float, default=0, help='threshold for making predictions, default : argmax')
    # Training hyperparameters
    parser.add_argument('--num-gpus', type=int, default=0)
    parser.add_argument('--optimizer', type=str, default='adam')
    parser.add_argument('--lr', type=float, default=4e-3)
    parser.add_argument('--n-epochs', type=int, default=10)
    parser.add_argument('--n-hidden', type=int, default=16, help='number of hidden units')
    parser.add_argument('--n-layers', type=int, default=2, help='number of hidden layers')
    parser.add_argument('--weight-decay', type=float, default=5e-4, help='Weight for L2 loss')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout probability, for gat only features')
    parser.add_argument('--embedding-size', type=int, default=64, help="embedding size for node embedding")

    return parser.parse_args(argv)

In [83]:
# Cast the transaction / entity id columns to int (modifies `data` in place).
data[['tid','sid','mid']] = data[['tid','sid','mid']].astype(int)

In [94]:
# Export the transactions (without the date column) as CSV.
# NOTE(review): the target path is hard-coded relative to the working directory and
# must already exist — confirm it matches the project layout.
data.drop('date',axis=1).to_csv('../fraud-challenge/input_data/transaction.csv',index=False)

In [65]:
def get_features(id_to_node, node_feature_files):
    """Build the node feature matrix ordered by graph node index.

    The input's first column is the node id, all remaining columns are numeric
    features. Ids missing from ``id_to_node`` are added to it (in place) with
    fresh indices and reported as new nodes.

    :param id_to_node: dictionary mapping node names(id) to dgl node idx; updated in place
    :param node_feature_files: either a pandas DataFrame (ids are used as stored),
        or an iterable of paths to comma separated files with a header line
        (ids are then the raw strings read from the file)
    :return: (np.ndarray, list) float32 node feature matrix in node-index order
        and the indices of new nodes not yet in the graph
    """
    if isinstance(node_feature_files, pd.DataFrame):
        node_ids = node_feature_files.iloc[:, 0].tolist()
        feature_rows = node_feature_files.iloc[:, 1:].to_numpy(dtype='float32')
    else:
        node_ids, rows = [], []
        for node_file in node_feature_files:
            with open(node_file, "r") as fh:
                next(fh, None)  # skip the header line
                for line in fh:
                    fields = line.strip().split(",")
                    node_ids.append(fields[0])
                    rows.append([float(value) for value in fields[1:]])
        feature_rows = np.array(rows).astype('float32')

    # default=-1 lets an empty mapping start numbering at 0 instead of raising.
    max_node = max(id_to_node.values(), default=-1)
    indices, new_nodes = [], []
    for node_id in node_ids:
        if node_id not in id_to_node:
            max_node += 1
            id_to_node[node_id] = max_node
            new_nodes.append(max_node)
        indices.append(id_to_node[node_id])

    # Reorder the rows so that row i holds the features of graph node i.
    features = feature_rows[np.argsort(indices), :]
    return features, new_nodes

def loadDF(f):
    """Read a CSV into a DataFrame; an empty file yields an empty DataFrame.

    :param f: path or file-like object accepted by ``pd.read_csv``
    :return: pd.DataFrame
    """
    try:
        frame = pd.read_csv(f)
    except pd.errors.EmptyDataError:
        frame = pd.DataFrame()
    return frame

def get_labels(id_to_node, n_nodes, target_node_type, labels_files, masked_nodes_files, additional_mask_rate=0):
    """
    Load the labels of the target nodes in node-index order and build the train/test masks.

    :param id_to_node: dictionary mapping node names(id) to dgl node idx
    :param n_nodes: number of user nodes in the graph
    :param target_node_type: column name for target node type (used as index of the label table)
    :param labels_files: iterable of CSV files containing labelled nodes (read with loadDF)
    :param masked_nodes_files: iterable of files containing the list of nodes to be masked
    :param additional_mask_rate: additional_mask_rate: float for additional masking of nodes with labels during training
    :return: (np.ndarray, np.ndarray, np.ndarray) labels, train mask array and test mask array
    """
    # Reverse mapping: dgl node idx -> node name(id)
    node_to_id = {v: k for k, v in id_to_node.items()}

    labels_df_from_files = map(loadDF, labels_files)
    user_to_label = pd.concat(labels_df_from_files, ignore_index=True).set_index(target_node_type)

    # Look up the label rows for node indices 0..n_nodes-1, converting each node name to int
    # (assumes the label files are indexed by integer ids — TODO confirm).
    labels = user_to_label.loc[map(int, pd.Series(node_to_id)[np.arange(n_nodes)].values)].values.flatten()
    masked_nodes = read_masked_nodes(masked_nodes_files)
    train_mask, test_mask = _get_mask(id_to_node, node_to_id, n_nodes, masked_nodes,
                                      additional_mask_rate=additional_mask_rate)
    return labels, train_mask, test_mask


def read_masked_nodes(masked_nodes_files):
    """
    Collect the node names listed in the given files, one name per line.

    :param masked_nodes_files: iterable of file paths listing the nodes to be masked i.e test users
    :return: list of whitespace-stripped lines, in file order
    """
    collected = []
    for path in masked_nodes_files:
        with open(path, "r") as handle:
            collected.extend(line.strip() for line in handle)
    return collected


def _get_mask(id_to_node, node_to_id, num_nodes, masked_nodes, additional_mask_rate):
    """
    Returns the train and test mask arrays

    :param id_to_node: dictionary mapping node names(id) to dgl node idx
    :param node_to_id: dictionary mapping dgl node idx to node names(id)
    :param num_nodes: number of user/account nodes in the graph
    :param masked_nodes: list of nodes to be masked during training, nodes without labels
    :param additional_mask_rate: float for additional masking of nodes with labels during training
    :return: (list, list) train and test mask array
    """
    train_mask = np.ones(num_nodes)
    test_mask = np.zeros(num_nodes)
    for node_id in masked_nodes:
        train_mask[id_to_node[node_id]] = 0
        test_mask[id_to_node[node_id]] = 1
    if additional_mask_rate and additional_mask_rate < 1:
        unmasked = np.array([idx for idx in range(num_nodes) if node_to_id[idx] not in masked_nodes])
        yet_unmasked = np.random.permutation(unmasked)[:int(additional_mask_rate*num_nodes)]
        train_mask[yet_unmasked] = 0
    return train_mask, test_mask


def _get_node_idx(id_to_node, node_type, node_id, ptr):
    if node_type in id_to_node:
        if node_id in id_to_node[node_type]:
            node_idx = id_to_node[node_type][node_id]
        else:
            id_to_node[node_type][node_id] = ptr
            node_idx = ptr
            ptr += 1
    else:
        id_to_node[node_type] = {}
        id_to_node[node_type][node_id] = ptr
        node_idx = ptr
        ptr += 1

    return node_idx, id_to_node, ptr

def df_to_edgelist(edges: pd.DataFrame, id_to_node, header=False, source_type='user', sink_type='user'):
    """
    Convert a two-column edge DataFrame into lists of (source idx, sink idx) tuples.

    The first column holds the source node ids, the second the sink node ids.
    Node ids are translated to per-type node indices via id_to_node; unseen ids
    get the next free index of their node type.

    :param edges: DataFrame containing bipartite edges; with header=True its two column names
                  are used as the source and sink node types
    :param id_to_node: dictionary containing mapping for node names(id) to dgl node indices
    :param header: boolean whether the column names should be used as node types
    :param source_type: type of the source node in the edge. defaults to 'user' if no header
    :param sink_type: type of the sink node in the edge. defaults to 'user' if no header.
    :return: (list, list, dict, str, str) the edges of a single relationship type as tuples, the same
             edges reversed, the updated id_to_node dict, the source node type and the sink node type.
    """
    edge_list = []
    rev_edge_list = []
    source_pointer, sink_pointer = 0, 0
    cols = edges.columns
    
    if header:
        source_type, sink_type = cols[0], cols[1]
        # Continue numbering after the highest index already handed out for each type.
        # NOTE(review): with header=False the pointers always start at 0, even if the
        # node type already has entries in id_to_node — confirm that case is never used.
        if source_type in id_to_node:
            source_pointer = max(id_to_node[source_type].values()) + 1
        if sink_type in id_to_node:
            sink_pointer = max(id_to_node[sink_type].values()) + 1
        
    for idx, row in edges.iterrows():
        source, sink = row[cols[0]], row[cols[1]]
            
        source_node, id_to_node, source_pointer = _get_node_idx(id_to_node, source_type, source, source_pointer)
        # Source and sink of the same type share one index space, so both advance source_pointer.
        if source_type == sink_type:
            sink_node, id_to_node, source_pointer = _get_node_idx(id_to_node, sink_type, sink, source_pointer)
        else:
            sink_node, id_to_node, sink_pointer = _get_node_idx(id_to_node, sink_type, sink, sink_pointer)

        edge_list.append((source_node, sink_node))
        rev_edge_list.append((sink_node, source_node))

    return edge_list, rev_edge_list, id_to_node, source_type, sink_type

In [66]:
def construct_graph(edges, nodes, target_node_type):
    """Build the DGL heterograph from the edge tables and attach the target node features.

    Every edge table contributes a forward and a reverse relation; the node type
    equal to ``target_node_type`` is renamed to 'target', and a self relation is
    added for all target nodes.

    :param edges: iterable of two-column edge DataFrames (see df_to_edgelist)
    :param nodes: feature input handed to get_features
    :param target_node_type: name of the node type that carries features and labels
    :return: (graph, features, target_id_to_node, id_to_node)
    """
    print("Getting relation graphs from the following edge lists : {} ".format(edges))
    edgelists, id_to_node = {}, {}
    for edge in edges:
        fwd_edges, bwd_edges, id_to_node, src, dst = df_to_edgelist(edge, id_to_node, header=True)
        src = 'target' if src == target_node_type else src
        dst = 'target' if dst == target_node_type else dst

        if src == 'target' and dst == 'target':
            print("Will add self loop for target later......")
            continue

        relation = src + '<>' + dst
        fwd_key = (src, relation, dst)
        bwd_key = (dst, dst + '<>' + src, src)
        if fwd_key in edgelists:
            edgelists[fwd_key] = edgelists[fwd_key] + fwd_edges
            edgelists[bwd_key] = edgelists[bwd_key] + bwd_edges
            print("Append edges for {} from edgelist: {}".format(relation, edge))
        else:
            edgelists[fwd_key] = fwd_edges
            edgelists[bwd_key] = bwd_edges
            print("Read edges for {} from edgelist: {}".format(relation, edge))

    # get features for target nodes
    features, new_nodes = get_features(id_to_node[target_node_type], nodes)
    print("Read in features for target nodes")

    # add self relation
    target_indices = id_to_node[target_node_type].values()
    edgelists[('target', 'self_relation', 'target')] = [(t, t) for t in target_indices]

    g = dgl.heterograph(edgelists)
    print(
        "Constructed heterograph with the following metagraph structure: Node types {}, Edge types{}".format(
            g.ntypes, g.canonical_etypes))
    print("Number of nodes of type target : {}".format(g.number_of_nodes('target')))

    g.nodes['target'].data['features'] = th.from_numpy(features)

    # expose the target mapping under the generic 'target' key
    target_id_to_node = id_to_node[target_node_type]
    id_to_node['target'] = target_id_to_node
    del id_to_node[target_node_type]

    return g, features, target_id_to_node, id_to_node

In [None]:
# Inspect everything the preprocessing step produced.
# NOTE(review): this renders every stored DataFrame; `list(processed_data)` shows just the keys.
processed_data

In [71]:
# Names of the stored per-relation edge lists.
[name for name in processed_data.keys() if 'relation' in name]

['relation_sid_edgelist', 'relation_mid_edgelist']

In [75]:
# Collect the relation edge tables, casting all ids to int.
# NOTE(review): the training cell below rebuilds `edges_data` without the int cast — confirm which is intended.
edges_data = [processed_data[name].astype(int) for name in processed_data.keys() if 'relation' in name]

In [None]:
# ---- Build the graph, train the RGCN and save the model ----
# Logging was already configured in the preprocessing section; the name `logging`
# is deliberately not rebound here so it keeps working for the functions that use it.

# `args` holds the training options. Jupyter puts its own kernel flags into sys.argv,
# which argparse would reject, so they are hidden while the defaults are parsed.
_saved_argv = sys.argv
sys.argv = sys.argv[:1]
try:
    args = parse_args()
finally:
    sys.argv = _saved_argv

print('numpy version:{} PyTorch version:{} DGL version:{}'.format(np.__version__,
                                                                    th.__version__,
                                                                    dgl.__version__))

edges_data = [processed_data[name] for name in processed_data.keys() if 'relation' in name]

# The second argument (node features) was missing, which made this cell a syntax error.
# The target node type must be the transaction id column: the edge tables name their
# node types after their columns, so LABEL_COL never occurs as a node type.
g, features, target_id_to_node, id_to_node = construct_graph(edges_data,
                                                             processed_data['features'],
                                                             TRANSACTION_ID_COL)

mean, stdev, features = normalize(th.from_numpy(features))

print('feature mean shape:{}, std shape:{}'.format(mean.shape, stdev.shape))

g.nodes['target'].data['features'] = features

print("Getting labels")
n_nodes = g.number_of_nodes('target')

# NOTE(review): labels and the test split are still read from files in args.training_dir,
# indexed by args.target_ntype — confirm these files match the in-memory data used above.
labels, _, test_mask = get_labels(target_id_to_node,
                                  n_nodes,
                                  args.target_ntype,
                                  get_files(args.labels, args.training_dir),
                                  get_files(args.new_accounts, args.training_dir))
print("Got labels")

labels = th.from_numpy(labels).float()
test_mask = th.from_numpy(test_mask).float()

n_nodes = th.sum(th.tensor([g.number_of_nodes(n_type) for n_type in g.ntypes]))
n_edges = th.sum(th.tensor([g.number_of_edges(e_type) for e_type in g.etypes]))

print("""----Data statistics------'
            #Nodes: {}
            #Edges: {}
            #Features Shape: {}
            #Labeled Test samples: {}""".format(n_nodes,
                                                    n_edges,
                                                    features.shape,
                                                    test_mask.sum()))

if args.num_gpus:
    cuda = True
    device = th.device('cuda:0')
else:
    cuda = False
    device = th.device('cpu')

print("Initializing Model")
in_feats = features.shape[1]
n_classes = 2

ntype_dict = {n_type: g.number_of_nodes(n_type) for n_type in g.ntypes}

model = get_model(ntype_dict, g.etypes, vars(args), in_feats, n_classes, device)
print("Initialized Model")

features = features.to(device)

labels = labels.long().to(device)
test_mask = test_mask.to(device)

# Class weights (class 1 weighted 12x) must live on the same device as the predictions.
loss = th.nn.CrossEntropyLoss(th.tensor([1, 12.]).to(device))

# print(model)
optim = th.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

print("Starting Model training")
model, class_preds, pred_proba = train_fg(model, optim, loss, features, labels, g, g,
                                          test_mask, device, args.n_epochs,
                                          args.threshold, args.compute_metrics)
print("Finished Model training")

print("Saving model")
save_model(g, model, args.model_dir, id_to_node, mean, stdev)
print("Model and metadata saved")
