# Simulation for Transactions, Anomalies and generated Features

Paper: A. Elliott, M. Cucuringu, M. M. Luaces, P. Reidy, and G. Reinert, Anomaly detection in networks with application to financial transaction networks, 2019

Based on the open source: https://github.com/zhangcheng1006/Anomaly-Detection-in-Networks

In [None]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

%matplotlib inline

In [None]:
import json
from numpyencoder import NumpyEncoder
from networkx.readwrite import json_graph
from datetime import datetime
from sklearn.model_selection import train_test_split

## Generating artificial networks and node features
This section demonstrates how to generate artificial networks and node features. Please put all Python scripts implementing the helper functions in the same directory as this notebook.

In [None]:
from rpy2.robjects.packages import importr
# Load the R package 'devtools' through rpy2 so that R packages can be
# installed from GitHub from within this notebook.
devtools = importr('devtools')
# One-time setup: uncomment the lines below to install the R packages from
# GitHub (presumably needed by the NetEMD feature code -- TODO confirm).
# devtools.install_github("dynverse/netdist", dependencies = True)
# devtools.install_github("alan-turing-institute/network-comparison")

ssss

In [None]:
from utils import generate_null_models, get_parameters
from generator import ER_generator, draw_anomalies
from basic_test import basic_features
from com_detection import community_detection
from spectral_localisation import spectral_features
from NetEMD import NetEMD_features
from path_finder import path_features

Set meta-parameters for network generation

In [None]:
# Meta-parameters of the simulation. The trailing "original" notes record the
# value each setting had originally, for reference when experimenting.
num_models = 20     # number of feature graphs generated by generate_multiple_graph_to_json_and_csv (original = 20)
num_nodes = 1000    # passed as n to ER_generator and to get_parameters (original = 1000)
num_basic_mc_samples = 500  # passed as num_samples to basic_features (original = 500)
num_references = 10     # number of reference models requested for the NetEMD features (original = 10)
num_null_models = 60    # number of null models requested for the NetEMD/spectral/path features (original = 60)

Set the two important parameters ```(p, w)```: ```p``` determines the edge density of the network, and ```1-w``` is the lower bound on the weight of the added anomaly edges.
All chosen ```(p, w)``` pairs must satisfy the detectability constraints (see equations (11)-(15) in the paper).

In [None]:
# Candidate grids: 50 evenly spaced values of p in [0.001, 0.05] and
# 11 evenly spaced values of w in [0.0, 0.01].
ps = np.linspace(0.001, 0.05, 50)
ws = np.linspace(0.0, 0.01, 11)
# get_parameters is defined in utils.py; presumably it keeps only the (p, w)
# pairs that satisfy the detectability constraints -- TODO confirm.
candidate_parameters = get_parameters(num_nodes, ps, ws)
# Cached length, used later to draw a random candidate index.
num_cand_param = len(candidate_parameters)

In [None]:
# Maps a node's "type" attribute to an integer class label. The None key
# (label 0) is used for nodes that carry no "type" attribute.
AML_TYPE_DICT = {None: 0, 'path': 1, 'star': 2, 'ring': 3, 'clique': 4, 'tree': 5}

Generate ```num_models``` models with randomly chosen parameters ```(p, w)```.

In [None]:
def generate_feature_graph(model_id, p, w):
    """Generate one network with anomalies and compute its node features.

    Args:
        model_id: Identifier of the model; only used in the log message.
        p: Value forwarded to ``ER_generator`` as ``p``.
        w: Anomaly parameter; ``draw_anomalies`` receives ``1 - w``.

    Returns:
        The graph returned by ``calculate_graph_features``.
    """
    progress = "Computing {}-th/{} model (p={:.3f}, w={:.3f})".format(model_id, num_models, p, w)
    logging.info(progress)

    base_graph = ER_generator(n=num_nodes, p=p, seed=None)
    anomalous_graph = draw_anomalies(base_graph, w=1 - w)
    return calculate_graph_features(anomalous_graph)


def calculate_graph_features(graph):
    """Run the full feature pipeline on ``graph`` and return the result.

    The steps run in a fixed order: two calls to ``generate_null_models``
    (reference models, then null samples), followed by ``NetEMD_features``,
    ``basic_features``, ``community_detection``, ``spectral_features`` and
    ``path_features``. Each feature step receives the graph returned by the
    previous one.

    Args:
        graph: The network to annotate with features.

    Returns:
        The graph returned by the last feature step (``path_features``).
    """
    logging.info("\n\nGenerating null models 1\n\n")
    # Only the second element of the returned pair is used as references.
    # The original code used min_size=20 here.
    _unused, reference_models = generate_null_models(
        graph, num_models=num_references, min_size=10)

    logging.info("\n\nGenerating null models 2\n\n")
    whole_null_models, component_null_models = generate_null_models(
        graph, num_models=num_null_models, min_size=20)

    logging.info("\n\nGenerating NetEMD features\n\n")
    with_netemd = NetEMD_features(
        graph,
        reference_models,
        component_null_models,
        num_references=num_references,
        num_samples=num_null_models,
    )

    logging.info("\n\nGenerating basic features\n\n")
    with_basic = basic_features(with_netemd, num_samples=num_basic_mc_samples)

    logging.info("\n\nGenerating community features\n\n")
    # The sample count for community detection is fixed at 20 here rather
    # than taken from num_null_models.
    with_communities = community_detection(with_basic, component_null_models, num_samples=20)

    logging.info("\n\nGenerating spectral features\n\n")
    with_spectral = spectral_features(
        with_communities, component_null_models, num_samples=num_null_models)

    logging.info("\n\nGenerating path features\n\n")
    # Path features use the first element of the second null-model pair.
    return path_features(with_spectral, whole_null_models, num_samples=num_null_models)


def write_json_graph(graph, model_id, p, w):
    """Serialise ``graph`` in node-link format to a JSON file.

    The file is written to
    ``./data_small_feature_graph/Network_p_<p>_w_<w>_<model_id>.json`` with
    ``p`` and ``w`` formatted to three decimals. The output directory is
    created if it does not exist, so a missing directory no longer discards
    an already computed feature graph.

    Args:
        graph: Graph to serialise.
        model_id: Identifier appended to the file name.
        p: Parameter value embedded in the file name.
        w: Parameter value embedded in the file name.
    """
    import os  # local import: ``os`` is not imported at the top of this file

    out_dir = './data_small_feature_graph'
    os.makedirs(out_dir, exist_ok=True)

    data = json_graph.node_link_data(graph)
    out_path = '{}/Network_p_{:.3f}_w_{:.3f}_{}.json'.format(out_dir, p, w, model_id)
    with open(out_path, 'w') as outfile:
        # NumpyEncoder lets json serialise NumPy scalars/arrays in the data.
        json.dump(data, outfile, cls=NumpyEncoder)


def write_csv_df(graph, model_id, p, w):
    """Write the node attributes of ``graph`` to a CSV file, one row per node.

    Missing attribute values and infinite values are replaced by 0. The
    file is written to
    ``./data_small_feature_graph/Network_p_<p>_w_<w>_<model_id>.csv`` with
    ``p`` and ``w`` formatted to three decimals. The output directory is
    created if it does not exist.

    Args:
        graph: Graph whose ``nodes(data=True)`` yields ``(node, attrs)`` pairs.
        model_id: Identifier appended to the file name.
        p: Parameter value embedded in the file name.
        w: Parameter value embedded in the file name.
    """
    import os  # local import: ``os`` is not imported at the top of this file

    out_dir = './data_small_feature_graph'

    logging.info("\n\nComposing DataFrame\n\n")
    # One row per node (index = node id), one column per attribute name.
    X = pd.DataFrame.from_dict(dict(graph.nodes(data=True, default=0)), orient='index')
    # Attributes absent on a node show up as NaN; NaN and +/-inf become 0.
    X = X.fillna(0).replace([np.inf, -np.inf], 0)

    logging.info("\n\nWriting to local file\n\n")
    os.makedirs(out_dir, exist_ok=True)
    X.to_csv('{}/Network_p_{:.3f}_w_{:.3f}_{}.csv'.format(out_dir, p, w, model_id))
    

def generate_multiple_graph_to_json_and_csv():
    """Generate ``num_models`` feature graphs and save each as JSON and CSV.

    For every model a ``(p, w)`` pair is drawn uniformly at random from
    ``candidate_parameters``; the resulting feature graph is then written
    with ``write_json_graph`` and ``write_csv_df``.
    """
    candidate_indices = range(num_cand_param)
    for model_id in range(num_models):
        chosen = np.random.choice(candidate_indices)
        p, w = candidate_parameters[chosen]
        feature_graph = generate_feature_graph(model_id, p, w)
        for writer in (write_json_graph, write_csv_df):
            writer(feature_graph, model_id, p, w)


def generate_graph_dataset_json_for_fastgcn(model_id):
    """Generate one feature graph and dump it as node-link JSON.

    A ``(p, w)`` pair is drawn uniformly at random from
    ``candidate_parameters``. The output always goes to the fixed path
    ``data_run_test/big_graph_50k.json``.

    Args:
        model_id: Identifier forwarded to ``generate_feature_graph``.
    """
    chosen = np.random.choice(range(num_cand_param))
    p, w = candidate_parameters[chosen]
    feature_graph = generate_feature_graph(model_id, p, w)

    payload = json_graph.node_link_data(feature_graph)
    with open('data_run_test/big_graph_50k.json', 'w') as sink:
        json.dump(payload, sink, cls=NumpyEncoder)


__Generating Feature Graph__:

In [None]:
# Driver cell: run the generation step and report wall-clock timing.
start = datetime.now()
print('starting...................................: ', start)

generate_multiple_graph_to_json_and_csv()
# Alternative: generate a single graph for the FastGCN experiments instead.
# generate_graph_dataset_json_for_fastgcn(4)

end = datetime.now()
# The start time is printed again so that start, finish and duration appear
# together at the end of the cell output.
print('starting...................................: ', start)
print('finish.....................................: ', end)
print('duration...................................: ', (end - start))

__Discussion__:

In the paper, the authors test a model trained on a training set generated with a specific parameter on a testing set generated with the same parameter. However, in practice, we can never know how the testing network was generated. To overcome this problem, we generate networks with random parameters and hide this information during the whole procedure, which means we know nothing about the parameters of either the training set or the testing set.

## Transform graph to other formats

In [None]:
def create_class_map_json(G, path, file_name):
    """Write ``<path><file_name>-class_map.json`` mapping node -> class id.

    The class id is looked up in ``AML_TYPE_DICT`` from the node's "type"
    attribute; nodes without that attribute are looked up under ``None``.

    Args:
        G: Graph whose node attributes are read.
        path: Output location prefix, concatenated as-is with ``file_name``.
        file_name: Base name of the output file.
    """
    class_map_json = {}
    for node in G.nodes():
        node_type = G.nodes[node].get("type", None)
        class_map_json[node] = AML_TYPE_DICT[node_type]

    target = path + file_name + '-class_map.json'
    with open(target, 'w') as sink:
        json.dump(class_map_json, sink, cls=NumpyEncoder)


def create_id_map_json(G, path, file_name):
    """Write ``<path><file_name>-id_map.json`` mapping node -> position.

    The position is the node's index in the iteration order of ``G.nodes()``.

    Args:
        G: Graph whose nodes are enumerated.
        path: Output location prefix, concatenated as-is with ``file_name``.
        file_name: Base name of the output file.
    """
    id_map_json = {}
    for position, node in enumerate(G.nodes()):
        id_map_json[node] = position

    target = path + file_name + '-id_map.json'
    with open(target, 'w') as sink:
        json.dump(id_map_json, sink, cls=NumpyEncoder)


def create_feats_npy(G, path, file_name):
    """Save the node feature matrix to ``<path><file_name>-feats.npy``.

    Rows follow the iteration order of ``G.nodes``; columns are the node
    attributes except "type". Attributes missing on a node are filled
    with 0.

    Args:
        G: Graph (or any object whose ``nodes`` maps node -> attribute dict).
        path: Output location prefix, concatenated as-is with ``file_name``.
        file_name: Base name of the output file.
    """
    g_df = pd.DataFrame.from_dict(G.nodes, orient='index')
    g_df = g_df.fillna(0)
    # "type" is the label, not a feature. errors='ignore' keeps this working
    # for graphs in which no node has a "type" attribute, which
    # create_class_map_json also tolerates.
    g_df = g_df.drop('type', axis=1, errors='ignore')
    feats = g_df.to_numpy()  # values only: the node-id index is not included
    np.save(path + file_name + '-feats.npy', feats)


def create_train_val_test_graph(G, path, file_name):
    """Flag nodes as test/validation and write ``<path><file_name>-updated.json``.

    Nodes are relabelled to strings, then split with stratification on
    their class (``AML_TYPE_DICT`` of the "type" attribute): 20% of the
    nodes are held out and halved into a test part and a validation part.
    Every node gets boolean "test" and "val" attributes; nodes with both
    set to False form the training set.

    Args:
        G: Input graph; it is not modified (relabelling returns a copy).
        path: Output location prefix, concatenated as-is with ``file_name``.
        file_name: Base name of the output file.

    Note:
        Stratified splitting needs enough members in each class; for very
        small classes ``train_test_split`` is expected to raise ``ValueError``.
    """
    G = nx.relabel_nodes(G, {n: str(n) for n in G.nodes()})

    # Node attributes live in G.nodes[n]. G[n] is the adjacency mapping
    # (neighbour -> edge data), so reading "type" from it always gave None
    # and every node ended up in class 0, which made `stratify` a no-op.
    class_map_json = {n: AML_TYPE_DICT[G.nodes[n].get("type", None)] for n in G.nodes()}
    x = list(class_map_json.keys())
    y = list(class_map_json.values())

    # First split: hold out 20% of the nodes; the other 80% stay unflagged.
    x_holdout, _, y_holdout, _ = train_test_split(
        x, y, train_size=.2, random_state=0, stratify=y)
    # Second split: halve the held-out nodes into test and validation parts.
    x_test_part, x_val_part, _, _ = train_test_split(
        x_holdout, y_holdout, train_size=.50, random_state=0, stratify=y_holdout)

    for n in G.nodes():
        G.nodes[n]['test'] = False
        G.nodes[n]['val'] = False
    for n in x_test_part:
        G.nodes[n]['test'] = True
    for n in x_val_part:
        G.nodes[n]['val'] = True

    data = json_graph.node_link_data(G)
    with open(path + file_name + '-updated.json', 'w') as outfile:
        json.dump(data, outfile, cls=NumpyEncoder)


def standard_graph_to_multiple_datasource(path, file_name):
    """Derive the class map, id map, feature matrix and split graph files.

    Loads the node-link JSON graph ``<path><file_name>.json`` and writes,
    next to it, the outputs of ``create_class_map_json``,
    ``create_id_map_json``, ``create_feats_npy`` and
    ``create_train_val_test_graph``.

    Args:
        path: Location prefix, concatenated as-is with ``file_name``.
        file_name: Base name (without ``.json``) of the input graph file.
    """
    # Use a context manager so the input file is closed deterministically.
    with open(path + file_name + '.json') as infile:
        G = json_graph.node_link_graph(json.load(infile))

    create_class_map_json(G, path, file_name)
    create_id_map_json(G, path, file_name)
    create_feats_npy(G, path, file_name)
    create_train_val_test_graph(G, path, file_name)

__Transforming Feature Graph__:

In [None]:
# Driver cell: convert one generated feature graph into the class-map,
# id-map, feature-matrix and split-graph files, and report timing.
start = datetime.now()
print('starting...................................: ', start)

# Expects 'data_fastgcn/input/Network_p_0.016_w_0.003_1.json' to exist.
standard_graph_to_multiple_datasource('data_fastgcn/input/', 'Network_p_0.016_w_0.003_1')

end = datetime.now()
# The start time is printed again so that start, finish and duration appear
# together at the end of the cell output.
print('starting...................................: ', start)
print('finish.....................................: ', end)
print('duration...................................: ', (end - start))

## Transform graph files to a single NumPy archive

In [None]:
def transfer_data_format(dataset_dir, output_file, file_name="Network_p_0.016_w_0.003_1"):
    """Bundle features, labels and split indices into one ``.npz`` archive.

    Reads ``<file_name>-updated.json``, ``-class_map.json``, ``-feats.npy``
    and ``-id_map.json`` from ``dataset_dir`` and saves the arrays
    ``feats``, ``y_train``, ``y_val``, ``y_test``, ``train_index``,
    ``val_index`` and ``test_index`` with ``np.savez``.

    Args:
        dataset_dir: Directory holding the input files (joined with "/").
        output_file: Appended to ``dataset_dir`` as-is (no separator is
            inserted) to form the output path.
        file_name: Common prefix of the input files. Defaults to the
            previously hard-coded ``"Network_p_0.016_w_0.003_1"``.
    """
    prefix = dataset_dir + "/" + file_name

    # Context managers close the files; the bare open() calls leaked handles.
    with open(prefix + "-updated.json") as graph_file:
        G = json_graph.node_link_graph(json.load(graph_file))
    with open(prefix + "-class_map.json") as labels_file:
        labels = json.load(labels_file)

    # G.nodes[...] matches the rest of this file; the G.node alias is not
    # available in current networkx releases.
    train_ids = [n for n in G.nodes() if not G.nodes[n]['val'] and not G.nodes[n]['test']]
    test_ids = [n for n in G.nodes() if G.nodes[n]['test']]
    val_ids = [n for n in G.nodes() if G.nodes[n]['val']]
    train_labels = [labels[i] for i in train_ids]
    test_labels = [labels[i] for i in test_ids]
    val_labels = [labels[i] for i in val_ids]
    feats = np.load(prefix + "-feats.npy")

    # Log-transform the first two feature columns to damp large values.
    # Column 1 is shifted first so that its minimum is at least 1.
    # NOTE(review): the earlier comment spoke of "num comments and score",
    # which does not match this dataset -- confirm these two columns are
    # the ones that should be transformed.
    feats[:, 0] = np.log(feats[:, 0] + 1.0)
    feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1))

    with open(prefix + "-id_map.json") as id_map_file:
        feat_id_map = json.load(id_map_file)

    # Translate node ids into row positions of the feature matrix.
    train_index = [feat_id_map[node_id] for node_id in train_ids]
    val_index = [feat_id_map[node_id] for node_id in val_ids]
    test_index = [feat_id_map[node_id] for node_id in test_ids]
    np.savez(dataset_dir + output_file, feats=feats, y_train=train_labels, y_val=val_labels, y_test=test_labels,
             train_index=train_index,
             val_index=val_index, test_index=test_index)