In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.11.0+cu113


In [36]:
import torch
import pickle
import random
import numpy as np
import math
from torch_geometric.data import DenseDataLoader

from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GATConv, BatchNorm, GraphNorm, GCNConv, DenseGraphConv, DenseSAGEConv
from torch_geometric.nn import global_mean_pool, dense_diff_pool, dense_mincut_pool
from torch_geometric.utils import to_dense_adj, to_dense_batch
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


cuda:0


In [37]:
with open(r'/content/drive/MyDrive/Augmented_Elliptic/address_level/address_subgraphs.pkl', "rb") as input_file:
    dataset = pickle.load(input_file)


In [38]:
print('====================')
print(f'Number of graphs: {len(dataset)}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Number of graphs: 102077

Data(edge_index=[2, 6060], interactions=[6060], num_nodes=98, x=[98, 25], y=node
2397173837    0
Name: class, dtype: int64, edge_attr=[6060, 5])
Number of nodes: 98
Number of edges: 6060
Average node degree: 61.84
Has isolated nodes: False
Has self-loops: True
Is undirected: False


In [39]:
classes = {dataset[i].y.index.map(str)[0]: dataset[i].y.item() for i in range(len(dataset))}
classes_1 = dict(filter(lambda i: i[1] == 1, classes.items()))
classes_0 = dict(filter(lambda i: i[1] == 0, classes.items()))

print(f'Number of high risk addresses: {len(classes_1.keys())}')
print(f'Number of safe addresses: {len(classes_0.keys())}')

# Balance the high risk and safe addresses (undersampling safe addresses)
random.seed(1993)
sample_safe = random.sample(list(classes_0.keys()), k = len(classes_1.keys()))

classes_0_trim = dict(filter(lambda i: i[0] in sample_safe, classes_0.items()))
classes_balanced = {**classes_1, **classes_0_trim}

Number of high risk addresses: 10169
Number of safe addresses: 91579


Need a way to get high risk graphs and then train against balanced number of licit graphs

In [40]:
classes_balanced_keys = list(classes_balanced.keys())
train_share = 0.8

train_keys = random.sample(classes_balanced_keys, k = round(train_share*len(classes_balanced_keys)))
test_keys = random.sample(classes_balanced_keys, k = len(classes_balanced_keys) - round(train_share*len(classes_balanced_keys)))

train_dataset = [graph for graph in dataset if graph.y.index[0] in train_keys] 
test_dataset = [graph for graph in dataset if graph.y.index[0] in test_keys]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of testing graphs: {len(test_dataset)}')

Number of training graphs: 16270
Number of testing graphs: 4068


In [41]:
for data in train_dataset:
  data.y = torch.tensor(data.y).type(torch.LongTensor)

for data in test_dataset:
  data.y = torch.tensor(data.y).type(torch.LongTensor) 

In [42]:
from torch_geometric.loader import DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
'''
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()
'''



In [43]:
# GAT

class GAT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, heads):
        super(GAT, self).__init__()
        torch.manual_seed(12345)
        self.gat_node_1 = GATConv(18, hidden_channels, edge_dim = 2, heads = heads)
        self.gat_node_2 = GATConv(heads * hidden_channels, hidden_channels, edge_dim = 2, heads = heads)
        self.gat_node_3 = GATConv(heads * hidden_channels, out_channels, edge_dim = 2, heads = heads)

        self.lin = Linear(out_channels * heads, 2)
        
        self.graphnorm1 = GraphNorm(hidden_channels * heads)
        self.graphnorm2 = GraphNorm(hidden_channels * heads)
        self.graphnorm3 = GraphNorm(out_channels * heads)

    def forward(self, x, edge_index, edge_attr, batch):
        # 1. Obtain node embeddings 
        x = x.float()
        x = self.gat_node_1(x, edge_index, edge_attr = edge_attr)
        x = self.graphnorm1(x)
        x = F.leaky_relu(x)
        x = self.gat_node_2(x, edge_index, edge_attr = edge_attr)
        x = self.graphnorm2(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.leaky_relu(x)
        x = self.gat_node_3(x, edge_index, edge_attr = edge_attr)
        x = self.graphnorm3(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.leaky_relu(x)

        # 2. Readout layer - turns node and edge embeddings into a graph embedding
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = self.lin(x)
        x = F.softmax(x, dim = 1)
        
        return x



In [44]:
# DiffPool

class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, normalize=False, lin=True):
        super().__init__()

        self.conv1 = DenseSAGEConv(in_channels, hidden_channels, normalize)
        self.bn1 = GraphNorm(hidden_channels)
        self.conv2 = DenseSAGEConv(hidden_channels, hidden_channels, normalize)
        self.bn2 = GraphNorm(hidden_channels)
        self.conv3 = DenseSAGEConv(hidden_channels, out_channels, normalize)
        self.bn3 = GraphNorm(out_channels)

        if lin is True:
            self.lin = torch.nn.Linear(2 * hidden_channels + out_channels,
                                       out_channels)
        else:
            self.lin = None

    def bn(self, i, x):
        batch_size, num_nodes, num_channels = x.size()

        x = x.view(-1, num_channels)
        x = getattr(self, f'bn{i}')(x)
        x = x.view(batch_size, num_nodes, num_channels)
        return x

    def forward(self, x, adj, batch, mask=None):
        #batch_size, num_nodes, in_channels = x.size()

        x0 = x

        adj_dense = to_dense_adj(adj, batch)
        
        x1 = self.bn(1, self.conv1(x0, adj, mask).relu())
        x2 = self.bn(2, self.conv2(x1, adj, mask).relu())
        x3 = self.bn(3, self.conv3(x2, adj, mask).relu())

        x = torch.cat([x1, x2, x3], dim=-1)

        if self.lin is not None:
            x = self.lin(x).relu()

        return x


class DiffPool(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.max_nodes = 0
        for data in dataset:
          temp_max = data.num_nodes
          if temp_max>self.max_nodes:
            self.max_nodes = temp_max

        num_nodes = math.ceil(0.25 * self.max_nodes)
        self.gnn1_pool = GNN(18, 64, num_nodes)
        self.gnn1_embed = GNN(18, 64, 64, lin=False)

        num_nodes = math.ceil(0.25 * num_nodes)
        self.gnn2_pool = GNN(3 * 64, 64, num_nodes)
        self.gnn2_embed = GNN(3 * 64, 64, 64, lin=False)

        self.gnn3_embed = GNN(3 * 64, 64, 64, lin=False)

        self.lin1 = torch.nn.Linear(3 * 64, 64)
        self.lin2 = torch.nn.Linear(64, 2)

    def forward(self, x, adj, batch, mask=None):
        
        s = self.gnn1_pool(x, adj, mask)
        x = self.gnn1_embed(x, adj, mask)

        #x, mask = to_dense_batch(x, batch)
        adj = to_dense_adj(adj, batch)

        x, adj, l1, e1 = dense_diff_pool(x, adj, s, mask)

        s = self.gnn2_pool(x, adj)
        x = self.gnn2_embed(x, adj)

        x, adj, l2, e2 = dense_diff_pool(x, adj, s)

        x = self.gnn3_embed(x, adj)

        x = x.mean(dim=1)
        x = self.lin1(x).relu()
        x = self.lin2(x)
        return F.log_softmax(x, dim=-1), l1 + l2, e1 + e2

In [45]:
# MinCut
class MinCut(torch.nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels=32):
        super().__init__()

        self.conv1 = GCNConv(in_channels, hidden_channels)
        num_nodes = math.ceil(0.5 * average_nodes)
        self.pool1 = Linear(hidden_channels, num_nodes)

        self.conv2 = DenseGraphConv(hidden_channels, hidden_channels)
        num_nodes = math.ceil(0.5 * num_nodes)
        self.pool2 = Linear(hidden_channels, num_nodes)

        self.conv3 = DenseGraphConv(hidden_channels, hidden_channels)

        self.lin1 = Linear(hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index).relu()

        x, mask = to_dense_batch(x, batch)
        adj = to_dense_adj(edge_index, batch)

        s = self.pool1(x)
        x, adj, mc1, o1 = dense_mincut_pool(x, adj, s, mask)

        x = self.conv2(x, adj).relu()
        s = self.pool2(x)

        x, adj, mc2, o2 = dense_mincut_pool(x, adj, s)

        x = self.conv3(x, adj)

        x = x.mean(dim=1)
        x = self.lin1(x).relu()
        x = self.lin2(x)
        return F.log_softmax(x, dim=-1), mc1 + mc2, o1 + o2


In [51]:
average_nodes = 0

for data in dataset:
    average_nodes = average_nodes + int(data.x.size(0) / len(data))

average_nodes = average_nodes / len(dataset)
print(average_nodes)
average_nodes = 2000

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model = GAT(hidden_channels = 128, out_channels = 32, heads = 10).to(device)
#model = DiffPool().to(device)
model = MinCut(in_channels = 25, hidden_channels = 256, out_channels = 2).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)
print(model)

MinCut(
  (conv1): GCNConv(25, 256)
  (pool1): Linear(in_features=256, out_features=1000, bias=True)
  (conv2): DenseGraphConv(256, 256)
  (pool2): Linear(in_features=256, out_features=500, bias=True)
  (conv3): DenseGraphConv(256, 256)
  (lin1): Linear(in_features=256, out_features=256, bias=True)
  (lin2): Linear(in_features=256, out_features=2, bias=True)
)


In [52]:
import warnings
warnings.filterwarnings('ignore')

#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    for data in train_loader:  # Iterate in batches over the training dataset.
         data = data.to(device)
         #print(data)
         #data.x = data.x.view(, data.num_nodes, 1)
         data.y = data.y.type(torch.LongTensor).to(device)
         optimizer.zero_grad()  # Clear gradients.
         #out = model(data.x, data.edge_index, data.edge_attr, data.batch) # GAT
         out, mc_loss, o_loss = model(data.x.type(torch.float), data.edge_index, data.batch) # MinCutPool
         #out = model(data.x.type(torch.float), data.edge_index, data.batch) # DiffPool
         #loss = criterion(out, data.y)  # Compute the loss.
         loss = F.nll_loss(out, data.y.view(-1)) + mc_loss + o_loss
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.


def test(loader):
     model.eval()
        
     metrics = {"accuracy":0,
               "precision":0,
               "recall":0,
               "f1_score":0}

     correct = 0
     actual, predicted = list(), list()
     for data in loader:  # Iterate in batches over the training/test dataset.
         data = data.to(device)
         
         data.y = data.y.type(torch.LongTensor).to(device)
         #out = model(data.x, data.edge_index, data.edge_attr, data.batch) # GAT
         out, mc_loss, o_loss = model(data.x.type(torch.float), data.edge_index, data.batch) # MinCutPool
         #out = model(data.x.type(torch.float), data.edge_index, data.batch) # DiffPool
         pred = out.cpu().argmax(dim=1).tolist()  # Use the class with highest probability.
         true = data.y.cpu().tolist()
         correct += sum(x == y for x, y in zip(true, pred)) # Check against ground-truth labels.
         actual += true
         predicted += pred 

     metrics['accuracy'] = correct / len(loader.dataset)
     metrics['precision'], metrics['recall'], metrics['f1_score'], _ = precision_recall_fscore_support(actual, predicted, average = 'binary') # precision, recall, f1score
     
     return metrics  # Derive ratio of correct predictions.

epochs = 100
for epoch in range(epochs):
    train()
    train_metrics = test(train_loader)
    test_metrics = test(test_loader)

    #print(train_metrics)
    #if epoch%10==0  or epoch==epochs:
    print(f'Epoch: {epoch:03d}') 
    print(f'Tr Acc: {train_metrics["accuracy"]:.3f}, Tr Rec: {train_metrics["recall"]:.3f}, Tr Pre: {train_metrics["precision"]:.3f}, Tr F1: {train_metrics["f1_score"]:.3f} '
            f'Te Acc: {test_metrics["accuracy"]:.3f}, Te Rec: {test_metrics["recall"]:.3f}, Te Pre: {test_metrics["precision"]:.3f}, Te F1: {test_metrics["f1_score"]:.3f}')

Epoch: 000
Tr Acc: 0.629, Tr Rec: 0.564, Tr Pre: 0.647, Tr F1: 0.603 Te Acc: 0.628, Te Rec: 0.576, Te Pre: 0.668, Te F1: 0.619
Epoch: 001
Tr Acc: 0.701, Tr Rec: 0.737, Tr Pre: 0.686, Tr F1: 0.711 Te Acc: 0.703, Te Rec: 0.746, Te Pre: 0.704, Te F1: 0.724
Epoch: 002
Tr Acc: 0.643, Tr Rec: 0.710, Tr Pre: 0.625, Tr F1: 0.665 Te Acc: 0.644, Te Rec: 0.709, Te Pre: 0.645, Te F1: 0.676
Epoch: 003
Tr Acc: 0.715, Tr Rec: 0.805, Tr Pre: 0.681, Tr F1: 0.738 Te Acc: 0.724, Te Rec: 0.816, Te Pre: 0.704, Te F1: 0.756
Epoch: 004
Tr Acc: 0.742, Tr Rec: 0.697, Tr Pre: 0.765, Tr F1: 0.730 Te Acc: 0.749, Te Rec: 0.711, Te Pre: 0.788, Te F1: 0.747
Epoch: 005
Tr Acc: 0.742, Tr Rec: 0.790, Tr Pre: 0.720, Tr F1: 0.753 Te Acc: 0.741, Te Rec: 0.787, Te Pre: 0.736, Te F1: 0.761
Epoch: 006
Tr Acc: 0.720, Tr Rec: 0.625, Tr Pre: 0.770, Tr F1: 0.690 Te Acc: 0.711, Te Rec: 0.626, Te Pre: 0.778, Te F1: 0.694
Epoch: 007
Tr Acc: 0.746, Tr Rec: 0.783, Tr Pre: 0.728, Tr F1: 0.755 Te Acc: 0.752, Te Rec: 0.791, Te Pre: 0.74

KeyboardInterrupt: ignored