In [1]:
import torch
print(torch.__version__)
!pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch_geometric
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
!pip install dgl
import dgl
from dgl.nn.pytorch.conv import GraphConv, APPNPConv
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

import pandas as pd
pd.set_option('max_colwidth', None)
import numpy as np
import time

import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

1.11.0+cu113
Looking in links: https://data.pyg.org/whl/torch-1.11.0+cu113.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 5.2 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_sparse-0.6.13-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 78.0 MB/s 
Installing collected packages: torch-sparse, torch-scatter
Successfully installed torch-scatter-2.0.9 torch-sparse-0.6.13
Collecting torch_geometric
  Downloading torch_geometric-2.0.4.tar.gz (407 kB)
[K     |████████████████████████████████| 407 kB 3.3 MB/s 
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.0.4-py3-none-any.whl size=616603 sha256=2cd1583102bb60bb0d20

DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


In [2]:
# Auxiliary functions
def plot_confusion(ytest, ypred):
    # function that plots confusion matrix given actual and predicted values

    ypred = [1 if item>0.5 else 0 for item in ypred]

    conf_mat = confusion_matrix(ytest, ypred)

    group_names = ['True Neg','False Pos','False Neg','True Pos']
    group_counts = ["{0:0.0f}".format(value) for value in conf_mat.flatten()]

    group_percentages = ["{0:.2%}".format(value) for value in conf_mat.flatten()/np.sum(conf_mat)]

    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    fig1, ax1 = plt.subplots(figsize=(8, 6));
    ax = sns.heatmap(conf_mat, annot=labels, fmt='', cmap='Blues')

    ax.set_title('Confusion Matrix');
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Actual Values ');

    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])

    plt.show()


## Set data

In [3]:
ell_classes = pd.read_csv('/content/drive/MyDrive/Original_Elliptic/raw/elliptic_txs_classes.csv')
ell_edges = pd.read_csv('/content/drive/MyDrive/Original_Elliptic/raw/elliptic_txs_edgelist.csv')
ell_features_all = pd.read_csv('/content/drive/MyDrive/Original_Elliptic/raw/elliptic_txs_features.csv', header=None)
ell_classes_augmented = pd.read_csv('/content/drive/MyDrive/Augmented_Elliptic/elliptic_txs_classes_augmented.csv')
ell_classes_augmented_in = pd.read_csv('/content/drive/MyDrive/Augmented_Elliptic/elliptic_txs_classes_augmented_input.csv')



# rename the classes to ints that can be handled by pytorch as labels
ell_classes["label"] = ell_classes["class"].replace(
    {"unknown": -1,  # unlabeled nodes
     "2": 0,  # labeled licit nodes
     #"1": 1,  # labeled illicit nodes
    }
).astype(int)

ell_classes_augmented["label"] = ell_classes_augmented["class"].fillna(-1)
ell_classes_augmented["label"] = ell_classes_augmented["label"].replace(
    {-1: -1,  # unlabeled nodes
     0: 0,  # labeled licit nodes
     1: 1,  # labeled illicit nodes on input or output
     2: 1,  # labeled illicit nodes on input and output
    }
).astype(int)

ell_classes_augmented_in["label"] = ell_classes_augmented_in["class"].fillna(-1)
ell_classes_augmented_in["label"] = ell_classes_augmented_in["label"].replace(
    {-1: -1,  # unlabeled nodes
     0: 0,  # labeled licit nodes
     1: 1,  # labeled illicit nodes on input or output
     2: 1,  # labeled illicit nodes on input and output
    }
).astype(int)

# rename features according to data description in paper
rename_dict = dict(
    zip(
        range(0, 167),
        ["txId", "time_step"]
        + [f"local_{i:02d}" for i in range(1, 94)]
        + [f"aggr_{i:02d}" for i in range(1, 73)],
    )
)
ell_features_all.rename(columns=rename_dict, inplace=True)

ell_features_local = ell_features_all.iloc[:,0:95]


In [4]:
print(ell_classes['label'].value_counts())
print(ell_classes_augmented['label'].value_counts())
print(ell_classes_augmented_in['label'].value_counts())

-1    157205
 0     42019
 1      4545
Name: label, dtype: int64
-1    150368
 0     41835
 1     11566
Name: label, dtype: int64
-1    157234
 0     39720
 1      6815
Name: label, dtype: int64


In [5]:
ell_features = ell_features_local
ell_classes = ell_classes_augmented_in

In [6]:
# instantiate the multidirected graph
g_nx = nx.MultiDiGraph() 
# add nodes with the txid and a dictionary with the label
g_nx.add_nodes_from(zip(ell_classes['txId'], [{'label': v} for v in ell_classes['label']]))
# add edges
g_nx.add_edges_from(zip(ell_edges['txId1'], ell_edges['txId2']));

# print graph basic statistics
print(f"Graph with {g_nx.number_of_nodes()} nodes and {g_nx.number_of_edges()} edges.")
print(f"Number of connected components: {len(list(nx.weakly_connected_components(g_nx)))}")

Graph with 203769 nodes and 234355 edges.
Number of connected components: 49


In [7]:
# store all the weekly connected components in a list
components = list(nx.weakly_connected_components(g_nx))
# create a list of subgraphs with each elements one of the weakly connected components
g_nx_t_list = [g_nx.subgraph(components[i]) for i in range(0,len(components))]

In [8]:
# choose a subgraph
sg = 1
g_nx_t_list[sg].number_of_nodes()

4544

In [9]:
# create unidirectional graph
'''
g = dgl.from_networkx(g_nx)
g.ndata["label"] = torch.tensor(ell_classes.set_index("txId").loc[sorted(g_nx.nodes()), "label"].values)
g.ndata["features_matrix"] = torch.tensor(ell_features.set_index("txId").loc[sorted(g_nx.nodes()), :].values)
print(g)
'''

'\ng = dgl.from_networkx(g_nx)\ng.ndata["label"] = torch.tensor(ell_classes.set_index("txId").loc[sorted(g_nx.nodes()), "label"].values)\ng.ndata["features_matrix"] = torch.tensor(ell_features.set_index("txId").loc[sorted(g_nx.nodes()), :].values)\nprint(g)\n'

In [10]:
g_nx_bidirectional = g_nx.to_undirected().to_directed()

# create bidirectional graph
g_bi = dgl.from_networkx(g_nx_bidirectional)
g_bi.ndata["label"] = torch.tensor(ell_classes.set_index("txId").loc[sorted(g_nx.nodes()), "label"].values)
g_bi.ndata["features_matrix"] = torch.tensor(ell_features.set_index("txId").loc[sorted(g_nx.nodes()), :].values)
print(g_bi)

Graph(num_nodes=203769, num_edges=468710,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'features_matrix': Scheme(shape=(94,), dtype=torch.float64)}
      edata_schemes={})


In [11]:
#g.add_edges(g.nodes(), g.nodes())
#print(g)
# add self loop to the bidirectional edges graph (normalization)
g_bi.add_edges(g_bi.nodes(), g_bi.nodes())
print(g_bi)

Graph(num_nodes=203769, num_edges=672479,
      ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'features_matrix': Scheme(shape=(94,), dtype=torch.float64)}
      edata_schemes={})


In [12]:
features = g_bi.ndata["features_matrix"].float()
labels = g_bi.ndata["label"].long()  # format required for cross entropy loss
in_feats = features.shape[1]
n_classes = 2  # licit or illicit (unknown label is ignored)
n_edges = g_bi.number_of_edges()

dataset_size = ell_classes["label"].notna().sum()
train_ratio = 0.7
train_time_steps = round(len(np.unique(features[:, 0])) * train_ratio)
shutdown_timestep = 43
train_indices = (((features[:, 0] <= train_time_steps) & (labels != -1)).nonzero().view(-1))
val_indices = (((features[:, 0] > train_time_steps) & (labels != -1)).nonzero().view(-1))

print(f"""Number of timesteps used for training: {train_time_steps}
Number of timesteps used for validation: {dataset_size-train_time_steps}""")

Number of timesteps used for training: 34
Number of timesteps used for validation: 203735


## Architecture

In [13]:
# Simple architecture
class simple(nn.Module):
  def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout, bias):
    super(simple, self).__init__()
            
    self.in_size = in_feats
    self.n_hidden = n_hidden
    self.out_size = n_classes
    self.dropout = dropout

    self.network = nn.Sequential(
      nn.Linear(self.in_size, 166), 
      nn.ReLU(), 
      nn.Linear(166, 64),
      nn.ReLU(), 
      nn.Dropout(dropout),
      nn.Linear(64, n_classes),
      nn.LogSoftmax(dim=1))
    
  def forward(self, x):
      x = x.view(-1, self.in_size)
      return self.network(x)

# Basic Graph Convolutional Network
class GCN(nn.Module):
    def __init__(self, g, in_feats, n_hidden, n_classes, n_layers, activation, dropout, bias):
        super(GCN, self).__init__()
        self.g = g
        self.layers = nn.ModuleList()
        
        # input layer
        self.layers.append(GraphConv(in_feats, n_hidden, activation=activation, bias=bias))
        
        # hidden layers
        for _ in range(n_layers - 2):
            self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation, bias=bias))
        
        # output layer
        self.layers.append(GraphConv(n_hidden, n_classes, bias=bias))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, features):
        h = features
        for i, layer in enumerate(self.layers):
            if i != 0:
                h = self.dropout(h)
            h = layer(self.g, h)
        return h
# Approximated Personalized Propagation of Neural Predictions
class APPNP(nn.Module):
    def __init__(
        self,
        g,
        in_feats,
        n_hidden,
        n_classes,
        n_layers,
        activation,
        feat_drop,
        edge_drop,
        alpha,
        k,
    ):
        super(APPNP, self).__init__()
        self.g = g
        self.layers = nn.ModuleList()
        # input layer
        self.layers.append(nn.Linear(in_feats, n_hidden))
        # hidden layers
        for _ in range(n_layers - 2):
            self.layers.append(nn.Linear(n_hidden, n_hidden))
        # output layer
        self.layers.append(nn.Linear(n_hidden, n_classes))
        self.activation = activation
        if feat_drop:
            self.feat_drop = nn.Dropout(feat_drop)
        else:
            self.feat_drop = lambda x: x
        self.propagate = APPNPConv(k, alpha, edge_drop)
        self.reset_parameters()

    def reset_parameters(self):
        for layer in self.layers:
            layer.reset_parameters()

    def forward(self, features):
        # prediction step
        h = features
        h = self.feat_drop(h)
        h = self.activation(self.layers[0](h))
        for layer in self.layers[1:-1]:
            h = self.activation(layer(h))
        
        h = self.layers[-1](self.feat_drop(h))
        
        # propagation step
        h = self.propagate(self.g, h)
        return h

# utility function to evaluate the model
def evaluate(model, loss_fcn, features, labels, mask):
    """Calculate the loss, accuracy, precision, recall and f1_score for the masked data"""
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        loss = loss_fcn(logits, labels)
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        p, r, f, _ = precision_recall_fscore_support(labels, indices)
        return loss, correct.item() * 1.0 / len(labels), p[1], r[1], f[1]
    

# utility function to obtain a confusion matrix
def eval_confusion_matrix(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
    plot_confusion(labels, indices)



In [14]:
def train_eval_model(model, model_class, g, features, **params):
    
    #bidirectional = params["bidirectional"] if "bidirectional" in params else None
    in_feats = features.shape[1]
    n_classes = 2
    n_hidden = params["n_hidden"]
    n_layers = params["n_layers"]
    weight_decay = params["weight_decay"]
    bias = params["bias"]
    dropout = params["dropout"]
    epochs = params["epochs"]
    lr = params["lr"]
    posweight = params["posweight"]
    
    if model == 'APPNP' :
      alpha = params["alpha"]
      k = params["k"]
      model = model_class(g, in_feats, n_hidden, n_classes, n_layers, F.relu, dropout, bias, alpha, k)
    if model == 'GCN' or model == 'Simple':
      model = model_class(g, in_feats, n_hidden, n_classes, n_layers, F.relu, dropout, bias)
    
    # weighted cross entropy loss function
    loss_fcn = torch.nn.CrossEntropyLoss(weight=torch.tensor([1 - posweight, posweight]))

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    dur = []
    metrics = {"loss":{"train": [], "val": []},
               "accuracy":{"train": [], "val": []},
               "precision":{"train": [], "val": []},
               "recall":{"train": [], "val": []},
               "f1_score":{"train": [], "val": []},
              }

    for epoch in range(epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward pass
        logits = model(features)
        loss = loss_fcn(logits[train_indices], labels[train_indices])
        metrics["loss"]["train"].append(loss)
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # duration
        if epoch >= 3:
            dur.append(time.time() - t0)
        # evaluate on training set
        _, train_acc, train_precision, train_recall, train_f1_score = evaluate(model, loss_fcn, features, labels, train_indices)
        metrics["accuracy"]["train"].append(train_acc)
        metrics["precision"]["train"].append(train_precision)
        metrics["recall"]["train"].append(train_recall)
        metrics["f1_score"]["train"].append(train_f1_score)
        # evaluate on validation set
        val_loss, val_acc, val_precision, val_recall, val_f1_score = evaluate(model, loss_fcn, features, labels, val_indices)
        metrics["loss"]["val"].append(val_loss)
        metrics["accuracy"]["val"].append(val_acc)
        metrics["precision"]["val"].append(val_precision)
        metrics["recall"]["val"].append(val_recall)
        metrics["f1_score"]["val"].append(val_f1_score)
        if (epoch + 1) % 100 == 0:
            print(
                f"Epoch {epoch:05d} | Time(s) {np.mean(dur):.2f} | val_loss {val_loss.item():.4f} "
                f"| Precision {val_precision:.4f} | Recall {val_recall:.4f} | Acc {val_acc:.4f} "
                f"| F1_score {val_f1_score:.4f}"
            )

    print("Confusion matrix:")
    eval_confusion_matrix(model, features, labels, val_indices)
    return model, metrics

In [15]:
# set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import warnings
warnings.filterwarnings('ignore')

params_simple = {
    'n_hidden' : 64,
    'n_layers' : 2, 
    'weight_decay' : 0., 
    'bias' : False,
    'epochs' : 1000,
    'lr' : 1e-3,
    'posweight': 0.7,
    'dropout' : 0.5,
}

params_GCN = {
    "n_hidden" : 128,
    "n_layers" : 2,
    "weight_decay" : 0.,
    "bias" : False,
    "dropout" : 0.25,
    "epochs" : 1000,
    "lr" : 1e-3,
    "posweight" : 0.7,
}

params_APPNP = {
    "n_hidden" : 256,
    "n_layers" : 2,
    "weight_decay" : 0.,
    "bias" : False,
    "dropout" : 0.2,
    "epochs" : 1000,
    "lr" : 1e-3,
    "posweight" : 0.7,
    "alpha" : 0.2,
    "k" : 20,
}

# train on graph bidirectional edges
#model, metrics = train_eval_model('Simple', simple, g_bi, features, **params_simple)
model, metrics = train_eval_model('GCN', GCN, g_bi, features, **params_GCN)
#model, metrics = train_eval_model('APPNP', APPNP, g_bi, features, **params_APPNP)

Epoch 00099 | Time(s) 0.18 | val_loss 0.5930 | Precision 0.2085 | Recall 0.4409 | Acc 0.7969 | F1_score 0.2831
Epoch 00199 | Time(s) 0.18 | val_loss 0.5840 | Precision 0.2386 | Recall 0.4509 | Acc 0.8192 | F1_score 0.3121
Epoch 00299 | Time(s) 0.18 | val_loss 0.5543 | Precision 0.3294 | Recall 0.4294 | Acc 0.8686 | F1_score 0.3728


KeyboardInterrupt: ignored