In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./input/elliptic_txs_features_test.csv
./input/elliptic_txs_classes.csv
./input/elliptic_txs_edgelist.csv
./input/elliptic_txs_features.csv


In [2]:
import pandas as pd 
import networkx as nx
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

id_time=["txId", "time_step"]
feature_names = ['feature_'+str(i) for i in range(1,166)]
column_names = id_time + feature_names
elliptic_classes = pd.read_csv('./input/elliptic_txs_classes.csv')
elliptic_classes.columns = ['txId', 'class_label']
elliptic_edgelist = pd.read_csv('./input/elliptic_txs_edgelist.csv')
elliptic_features = pd.read_csv('./input/elliptic_txs_features.csv', names=column_names)
elliptic_features["centrality"] =None

In [63]:
len(elliptic_features[elliptic_features["time_step"]>0])

203769

## Basic Model

In [89]:
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.datasets import EllipticBitcoinDataset
#from torch_geometric.logging import init_wandb, log
from torch_geometric.nn import GCNConv

dataset = EllipticBitcoinDataset(root='./pytorch_input')

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        #self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # x: Node feature matrix of shape [num_nodes, in_channels]
        # edge_index: Graph connectivity matrix of shape [2, num_edges]
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv1(x, edge_index).relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        #x = self.conv2(x, edge_index).relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        # print(self.conv3(x, edge_index).shape)
        x = self.conv3(x, edge_index).softmax(dim=1)
        return x


In [88]:
simplemodel = GCN(dataset.num_features, 32, dataset.num_classes+1)
data=dataset[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
labeled = data.y>0
randomize = torch.FloatTensor(data.y.shape[0]).uniform_()
classweight = torch.FloatTensor([int((data.y == 1).sum()),int((data.y == 0).sum()),0])
data.train_mask = torch.logical_and(data.y <2,randomize>0.2)# 80% of labeled
data.test_mask = torch.logical_and(data.y <2,randomize< 0.2)# rest 20% of labeled
# init_wandb(name=f'GCN-{args.dataset}', lr=args.lr, epochs=args.epochs,            hidden_channels=args.hidden_channels, device=device)
simplemodel, data = simplemodel.to(device), data.to(device)
optimizer = torch.optim.Adam([ dict(params=simplemodel.conv1.parameters(), weight_decay=5e-4), 
dict(params=simplemodel.conv2.parameters(), weight_decay=5e-4),
dict(params=simplemodel.conv3.parameters(), weight_decay=0)], lr=0.005)  # Only perform weight-decay on first convolution.


def train(model,d):
    model.train()
    optimizer.zero_grad()
    out = model(d.x, d.edge_index)
    #print(out.shape,new_y.shape)
    #print(data.y[data.test_mask].shape)
    loss = F.cross_entropy(out[d.train_mask], d.y[d.train_mask],weight=classweight)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(model,d):
    model.eval()
    pred = model(d.x, d.edge_index).argmax(dim=-1)
    accs = []
    for mask in [d.train_mask, d.test_mask]:
        accs.append(int((pred[mask] == d.y[mask]).sum()) / int(mask.sum()))
    return accs


    #log(Epoch=epoch, Loss=loss, Train=train_acc, Val=0, Test=test_acc)

AttributeError: 'GCN' object has no attribute 'conv2'

In [131]:
#70/30 split, first 34 timestep, occurs at index 136265
newsplit =  torch.Tensor([True]*136265+[False]*67504)
data.train_mask = torch.logical_and(data.y <2,newsplit)
data.test_mask = torch.logical_and(data.y <2,torch.logical_not(newsplit))
#a 2-layer GCN,size of the node embeddings to be 100
papermodel = GCN(dataset.num_features, 100, dataset.num_classes+1).to(device)
#Adam Optimizer with learning rate of 0.001
optimizer = torch.optim.Adam(params=papermodel.parameters(),lr=0.001) 
#a weighted cross entropy loss 0.3/0.7 ratio for the licit and illicit classes (y = 0 is licit and y=1 is illicit)
classweight = torch.FloatTensor([0.3,0.7,0])
#trained the GCN model for 1000 epochs
re = []
for epoch in range(1, 1000):
    loss = train(papermodel,data)
    train_acc, tmp_test_acc = test(papermodel,data)
    re.append([epoch,loss,train_acc,tmp_test_acc])
    print(epoch,loss,train_acc,tmp_test_acc)

1 1.2605295181274414 0.1813741888004282 0.42939412117576486
2 1.2236077785491943 0.2504181441091858 0.4683863227354529
3 1.1807528734207153 0.33140429517628955 0.5064187162567486
4 1.1314644813537598 0.4369438683347829 0.5516496700659868
5 1.0747287273406982 0.5405767043553891 0.5931613677264547
6 1.0157239437103271 0.6211948886064094 0.6296940611877625
7 0.9675818681716919 0.6820432193751255 0.6546490701859629
8 0.9270380735397339 0.7299792600521844 0.6710257948410318
9 0.8929656147956848 0.7667424901317991 0.6854229154169166
10 0.8646687865257263 0.791663879039272 0.7011997600479905
11 0.8412269353866577 0.8094935438549542 0.7110377924415117
12 0.8217549324035645 0.8245801833143774 0.7197960407918417
13 0.8055160641670227 0.838161503980732 0.7257948410317936
14 0.7919061779975891 0.8468254499230615 0.7317936412717456
15 0.7804004549980164 0.8553221382217168 0.7381523695260948
16 0.7705780267715454 0.86064093129056 0.7409718056388722
17 0.7621263265609741 0.865759015186994 0.744031193

In [132]:
pred = papermodel(data.x, data.edge_index).argmax(dim=-1)
from torchmetrics.functional import precision_recall
print(precision_recall(pred[data.train_mask], data.y[data.train_mask], average='none', num_classes=3))
print(precision_recall(pred[data.test_mask], data.y[data.test_mask], average='none', num_classes=3))
# print(int((pred[mask] == d.y[mask]).sum()) / int(mask.sum()))

(tensor([0.9854, 0.9052,    nan]), tensor([0.9878, 0.8885,    nan]))
(tensor([0.9575, 0.6786, 0.0000]), tensor([0.9877, 0.3684, 0.0000]))


In [133]:
class SkipGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        #self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.skip = torch.nn.Linear(in_channels,hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # x: Node feature matrix of shape [num_nodes, in_channels]
        # edge_index: Graph connectivity matrix of shape [2, num_edges]
        #x = F.dropout(x, p=0.5, training=self.training)
        s = self.skip(x)
        x = self.conv1(x, edge_index).relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        #x = self.conv2(x, edge_index).relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv3(x+s, edge_index).softmax(dim=1)
        return x
skipmodel = SkipGCN(dataset.num_features, 100, dataset.num_classes+1).to(device)
optimizer = torch.optim.Adam(skipmodel.parameters(),lr=0.001) 
re2 = []
for epoch in range(1, 1000):
    loss = train(skipmodel,data)
    train_acc, tmp_test_acc = test(skipmodel,data)
    re2.append([epoch,loss,train_acc,tmp_test_acc])
    print(epoch,loss,train_acc,tmp_test_acc)


1 1.1381086111068726 0.5147521241720746 0.16946610677864427
2 1.0141297578811646 0.6288552886866929 0.2068386322735453
3 0.938466489315033 0.6976985348230414 0.23959208158368325
4 0.886393129825592 0.7397136549140295 0.2692861427714457
5 0.8479909896850586 0.7758413059476819 0.2954409118176365
6 0.8179726600646973 0.7998595035793136 0.32225554889022195
7 0.7951459288597107 0.8145447246939185 0.3504499100179964
8 0.7774631381034851 0.826420017394795 0.3689862027594481
9 0.7635196447372437 0.834950157222185 0.3864427114577085
10 0.7523097991943359 0.8407038201645816 0.4021595680863827
11 0.7430374026298523 0.8466581922793872 0.41745650869826034
12 0.7352127432823181 0.8518097277045561 0.4354529094181164
13 0.7285518050193787 0.8554893958653911 0.4518296340731854
14 0.722815990447998 0.8594701277848398 0.4683863227354529
15 0.7178969979286194 0.8622800561985683 0.48200359928014397
16 0.7136964201927185 0.8641867933364554 0.49508098380323934
17 0.7100780606269836 0.8664614972904262 0.50593

In [134]:
pred2 = skipmodel(data.x, data.edge_index).argmax(dim=-1)
from torchmetrics.functional import precision_recall
print(precision_recall(pred2[data.train_mask], data.y[data.train_mask], average='none', num_classes=3))
print(precision_recall(pred2[data.test_mask], data.y[data.test_mask], average='none', num_classes=3))

(tensor([0.9902, 0.9167,    nan]), tensor([0.9890, 0.9255,    nan]))
(tensor([0.9658, 0.4176, 0.0000]), tensor([0.9464, 0.5171, 0.0000]))


In [143]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
class DIVYANet(torch.nn.Module):
    def __init__(self):
        super(DIVYANet, self).__init__()
        
        self.conv1 = GCNConv(165, 128)
        self.conv2 = GCNConv(128, 128)
        self.conv3 = GCNConv(64, 64)
        self.conv4 = GCNConv(128, 1) 

    def forward(self, data, adj=None):
        edge_index = data.edge_index
        x = self.conv1(data.x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.1, training=self.training)
        x = self.conv4(x, edge_index)

        return F.sigmoid(x)
DIVYAmodel = DIVYANet().to(device)
#DIVYAmodel.double()
optimizer = torch.optim.Adam(DIVYAmodel.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
criterion = torch.nn.BCELoss()
DIVYAmodel.train()
for epoch in range(70):
    optimizer.zero_grad()
    out = DIVYAmodel(data)
    # data_train.y.unsqueeze(1)
    out = out.reshape((data.x.shape[0]))
    loss = criterion(out[data.train_mask], data.y[data.train_mask].float())
    auc = roc_auc_score(data.y.detach().cpu().numpy()[data.train_mask], out.detach().cpu().numpy()[data.train_mask]) #[train_idx]
    loss.backward()
    optimizer.step()
    if epoch%5 == 0:
      print("epoch: {} - loss: {} - roc: {}".format(epoch, loss.item(), auc))



epoch: 0 - loss: 0.7264694571495056 - roc: 0.4612517198574492




epoch: 5 - loss: 0.2797970175743103 - roc: 0.9004542126256988




epoch: 10 - loss: 0.24465961754322052 - roc: 0.9223821109734467




epoch: 15 - loss: 0.21319642663002014 - roc: 0.9372741771873248




epoch: 20 - loss: 0.1978733241558075 - roc: 0.9425134751672604




epoch: 25 - loss: 0.18447040021419525 - roc: 0.9489058524373236




epoch: 30 - loss: 0.17492778599262238 - roc: 0.9534096376099274




epoch: 35 - loss: 0.1674516350030899 - roc: 0.9568833332983637




epoch: 40 - loss: 0.16095851361751556 - roc: 0.9599251849988739




epoch: 45 - loss: 0.15434284508228302 - roc: 0.9628311517873754




epoch: 50 - loss: 0.14860326051712036 - roc: 0.964872714812359




epoch: 55 - loss: 0.14361512660980225 - roc: 0.9669783982057706




epoch: 60 - loss: 0.1385095864534378 - roc: 0.9688327581678913




epoch: 65 - loss: 0.13490982353687286 - roc: 0.9707581778139831




In [144]:
pred3 = DIVYAmodel(data).argmax(dim=-1)
print(precision_recall(pred3[data.train_mask], data.y[data.train_mask], average='none', num_classes=3))
print(precision_recall(pred3[data.test_mask], data.y[data.test_mask], average='none', num_classes=3))

(tensor([0.8842, 0.0000,    nan]), tensor([1., 0., nan]))
(tensor([0.9350, 0.0000,    nan]), tensor([1., 0., nan]))




## Generate New Centrality Feature

In [13]:
time_steps = list(range(1,17))
graphs = []
for time_step in time_steps:
    extract_nodes = list(set(elliptic_features[elliptic_features['time_step']==time_step]['txId'].values.tolist()))
    edgelist_extract = elliptic_edgelist[elliptic_edgelist['txId1'].isin(extract_nodes) & elliptic_edgelist['txId2'].isin(extract_nodes)].values.tolist()
    edgelist = [tuple(row) for row in edgelist_extract]
    G = nx.DiGraph()
    G.add_edges_from(edgelist)
    graphs.append(G)
    centrality = nx.in_degree_centrality(G)
    print(len(centrality.keys()))
    for id in centrality.keys():
        elliptic_features.loc[(elliptic_features["txId"] == id) &(elliptic_features['time_step']==time_step) ,"centrality"] = centrality[id]
    #elliptic_features = pd.merge(elliptic_features,subset[["txId","time_step","new"]],on = ["txId","time_step"],how="left")
    #print(time_step,len(subset),len(centrality))
extract_nodes = list(set(elliptic_features[elliptic_features['time_step'].isin(time_steps)]['txId'].values.tolist()))

7880
4544
6621
5693
6803
4328
6048
4457
4996
6727
4296
2047
4528
2022
3639
2975


In [46]:
new = dataset[0]
new.x = elliptic_features
new.x = torch.Tensor(new.x[new.x.columns[2:]].values.astype(np.float32))
new.x = torch.nan_to_num(new.x, nan=0)
modelnew = GCN(dataset.num_features+1, 16, dataset.num_classes+1)

In [53]:
randomize = torch.FloatTensor(data.y.shape[0]).uniform_()
new.train_mask = torch.logical_and(data.y > 0,randomize>0.2)# 80% of labeled
new.test_mask = torch.logical_and(data.y > 0,randomize< 0.2)# rest 20% of labeled
modelnew, new = modelnew.to(device), new.to(device)
best_val_acc = final_test_acc = 0
train_acc = tmp_test_acc = 0
for epoch in range(1, 21):
    loss = train(modelnew,new)
    train_acc, tmp_test_acc = test(modelnew,new)
    print(epoch,loss,train_acc,tmp_test_acc)

1 1.5124796628952026 0.046919845586130604 0.04648013051374396
2 1.5283610820770264 0.046919845586130604 0.04648013051374396
3 1.5279396772384644 0.046919845586130604 0.04648013051374396
4 1.5333728790283203 0.046919845586130604 0.04648013051374396
5 1.518808126449585 0.046919845586130604 0.04648013051374396
6 1.5292044878005981 0.046919845586130604 0.04648013051374396
7 1.522578477859497 0.046919845586130604 0.04648013051374396
8 1.5254406929016113 0.046919845586130604 0.04648013051374396
9 1.5326306819915771 0.046919845586130604 0.04648013051374396
10 1.5192081928253174 0.046919845586130604 0.04648013051374396
11 1.5253543853759766 0.046919845586130604 0.04648013051374396
12 1.5113784074783325 0.046919845586130604 0.04648013051374396
13 1.5392752885818481 0.046919845586130604 0.04648013051374396
14 1.5097583532333374 0.046919845586130604 0.04648013051374396
15 1.5169997215270996 0.046919845586130604 0.04648013051374396


KeyboardInterrupt: 