In [79]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader

### Load and prepare data

In [80]:
# Load the elliptic dataset
data_dir = 'data/elliptic_bitcoin_dataset'

node_features = pd.read_csv(f'{data_dir}/elliptic_txs_features.csv', header=None)
edges = pd.read_csv(f'{data_dir}/elliptic_txs_edgelist.csv')
classes = pd.read_csv(f'{data_dir}/elliptic_txs_classes.csv')

In [81]:
node_features = node_features.set_index(0)
classes = classes.set_index('txId')

# only keep the features that are labeled
labeled_data = classes[classes['class'] != 'unknown']
labeled_data['class'] = labeled_data['class'].map({'1': 1, '2': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['class'] = labeled_data['class'].map({'1': 1, '2': 0})


In [82]:
tx_map = {tx: i for i, tx in enumerate(sorted(labeled_data.index))}
reversed_tx_map = {i: tx for tx, i in tx_map.items()}

In [83]:
_nodes = set(labeled_data.index)
edges = edges[(edges.txId1.isin(_nodes)) & (edges.txId2.isin(_nodes))]
node_features = node_features.loc[list(_nodes)]

In [84]:
# reset indices to 0-based integers
labeled_data.index = labeled_data.index.map(tx_map)
edges['txId1'] = edges['txId1'].map(tx_map)
edges['txId2'] = edges['txId2'].map(tx_map)
node_features.index = node_features.index.map(tx_map)

In [85]:
labeled_data['class'].value_counts()

class
0    42019
1     4545
Name: count, dtype: int64

# Traditional modeling

In [86]:
train, test = train_test_split(
    labeled_data,
    test_size=0.2,
    stratify=labeled_data['class'], 
    random_state=42
)
train.shape, test.shape

((37251, 1), (9313, 1))

In [87]:
scaler = StandardScaler()

train_df = node_features.loc[train.index]
train_df = pd.DataFrame(
    scaler.fit_transform(train_df),
    index=train_df.index,
    columns=train_df.columns
)

test_df = node_features.loc[test.index]
test_df = pd.DataFrame(
    scaler.transform(test_df),
    index=test_df.index,
    columns=test_df.columns
)

In [98]:
clf = RandomForestClassifier()
clf.fit(train_df, train['class'])

In [None]:
metrics.f1_score(train['class'], clf.predict(train_df)), \
metrics.f1_score(test['class'], clf.predict(test_df))

(1.0, 0.9872221625684527)

## Graph modeling

In [90]:
# Create graph
edge_index = torch.tensor(edges.values.T, dtype=torch.long)
x = torch.tensor(
    node_features.loc[labeled_data.index].values,
    dtype=torch.float
)
y = torch.tensor(
    labeled_data['class'].values,
    dtype=torch.long
)

data = Data(x=x, edge_index=edge_index, y=y)

In [91]:
# Split data into train and test sets
train_mask = torch.rand(len(y)) < 0.8
test_mask = ~train_mask

data.train_mask = train_mask
data.test_mask = test_mask

In [92]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [93]:
model = GCN(input_dim=x.size(1), hidden_dim=16, output_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [94]:
%%time

model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 2.371016502380371
Epoch 2, Loss: 0.7351816296577454
Epoch 3, Loss: 0.3892441689968109
Epoch 4, Loss: 0.4281013309955597
Epoch 5, Loss: 0.5008909106254578
Epoch 6, Loss: 0.5626734495162964
Epoch 7, Loss: 0.6069450974464417
Epoch 8, Loss: 0.6337854862213135
Epoch 9, Loss: 0.6450944542884827
Epoch 10, Loss: 0.643189549446106
Epoch 11, Loss: 0.6305001378059387
Epoch 12, Loss: 0.609434962272644
Epoch 13, Loss: 0.5820143818855286
Epoch 14, Loss: 0.550147294998169
Epoch 15, Loss: 0.5157629251480103
Epoch 16, Loss: 0.48056715726852417
Epoch 17, Loss: 0.44640183448791504
Epoch 18, Loss: 0.4151143729686737
Epoch 19, Loss: 0.38844260573387146
Epoch 20, Loss: 0.36757948994636536
Epoch 21, Loss: 0.3529711067676544
Epoch 22, Loss: 0.34415408968925476
Epoch 23, Loss: 0.3395500183105469
Epoch 24, Loss: 0.33690616488456726
Epoch 25, Loss: 0.33374691009521484
Epoch 26, Loss: 0.3283871114253998
Epoch 27, Loss: 0.32054659724235535
Epoch 28, Loss: 0.31121590733528137
Epoch 29, Loss: 0.301940

In [96]:
model.eval()
_, pred = model(data).max(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
accuracy = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9289
