In [1]:
import os.path as osp

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from torch_cluster import random_walk

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import NeighborSampler as RawNeighborSampler
from torch_geometric.nn import SAGEConv

EPS = 1e-15

In [2]:
from torch_geometric.datasets import EllipticBitcoinDataset
dataset = EllipticBitcoinDataset(root='./pytorch_input')
data=dataset[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#70/30 split, first 34 timestep, occurs at index 136265
newsplit =  torch.Tensor([True]*136265+[False]*67504)
data.train_mask = torch.logical_and(data.y <2,newsplit)
data.test_mask = torch.logical_and(data.y <2,torch.logical_not(newsplit))

In [35]:
class NeighborSampler(RawNeighborSampler):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        # For each node in `batch`, we sample a direct neighbor (as positive
        # example) and a random node (as negative example):
        pos_batch = random_walk(row, col, batch, walk_length=1,
                                coalesced=False)[:, 1]

        neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(), ),
                                  dtype=torch.long)

        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        return super().sample(batch)


train_loader = NeighborSampler(data.edge_index, sizes=[10, 10], batch_size=256,
                               shuffle=True, num_nodes=data.num_nodes)


class SAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels,aggr="mean"))

    def forward(self, x, adjs):
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    def full_forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SAGE(data.num_node_features, hidden_channels=64, num_layers=3)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=5e-4)
x, edge_index = data.x.to(device), data.edge_index.to(device)
classweight = torch.FloatTensor([0.3,0.7,0])

def train():
    model.train()

    total_loss = 0
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        #print(adjs)
        optimizer.zero_grad()

        out = model(x[n_id], adjs)
        print(out.shape)
        # out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)

        # pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        # neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        # loss = -pos_loss - neg_loss
        loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask],weight=classweight)
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * out.size(0)

    return total_loss / data.num_nodes


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    accs = []
    for mask in [data.train_mask, data.test_mask]:
        accs.append(int((pred[mask] == data.y[mask]).sum()) / int(mask.sum()))
    return accs
# def test():
#     model.eval()
#     out = model.full_forward(x, edge_index).cpu()
#     clf = LogisticRegression()
#     clf.fit(out[data.train_mask], data.y[data.train_mask])

#     train_acc = clf.score(out[data.train_mask], data.y[data.train_mask])
#     test_acc = clf.score(out[data.test_mask], data.y[data.test_mask])

#     return train_acc, test_acc


for epoch in range(1,51):
    loss = train()
    train_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
          f'Train: {train_acc:.4f}, Test: {test_acc:.4f}')

torch.Size([768, 64])


IndexError: The shape of the mask [203769] at index 0 does not match the shape of the indexed tensor [768, 64] at index 0

In [28]:
from sklearn.metrics import roc_curve, confusion_matrix
out = model.full_forward(x, edge_index).cpu()
clf = LogisticRegression()
clf.fit(out[data.train_mask].detach().numpy(), data.y[data.train_mask])
pred = clf.predict(out[data.test_mask].detach().numpy())
print(confusion_matrix(data.y[data.test_mask],pred))

[[15504    83]
 [ 1062    21]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
from torch_geometric.loader import NeighborLoader
import copy
kwargs = {'batch_size': 1024, 'num_workers': 6, 'persistent_workers': True}
train_loader = NeighborLoader(data, input_nodes=data.train_mask,
                              num_neighbors=[25, 10], shuffle=True, **kwargs)

subgraph_loader = NeighborLoader(copy.copy(data), input_nodes=None,
                                 num_neighbors=[-1], shuffle=False, **kwargs)

# No need to maintain these features during evaluation:
del subgraph_loader.data.x, subgraph_loader.data.y
# Add global node index information.
subgraph_loader.data.num_nodes = data.num_nodes
subgraph_loader.data.n_id = torch.arange(data.num_nodes)

classweight = torch.FloatTensor([0.3,0.7])
class SAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        # self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i < len(self.convs) - 1:
                x = x.relu_()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    @torch.no_grad()
    def inference(self, x_all, subgraph_loader):
        #pbar = tqdm(total=len(subgraph_loader.dataset) * len(self.convs))
        #pbar.set_description('Evaluating')

        # Compute representations of nodes layer by layer, using *all*
        # available edges. This leads to faster computation in contrast to
        # immediately computing the final representations of each batch:
        for i, conv in enumerate(self.convs):
            xs = []
            for batch in subgraph_loader:
                x = x_all[batch.n_id.to(x_all.device)].to(device)
                x = conv(x, batch.edge_index.to(device))
                if i < len(self.convs) - 1:
                    x = x.relu_()
                xs.append(x[:batch.batch_size].cpu())
                #pbar.update(batch.batch_size)
            x_all = torch.cat(xs, dim=0)
        #pbar.close()
        return x_all


model = SAGE(dataset.num_features, 256, dataset.num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01,weight_decay=5e-5)


def train(epoch):
    model.train()

    #pbar = tqdm(total=int(len(train_loader.dataset)))
    #pbar.set_description(f'Epoch {epoch:02d}')

    total_loss = total_correct = total_examples = 0
    for batch in train_loader:
        optimizer.zero_grad()
        y = batch.y[:batch.batch_size]
        y_hat = model(batch.x, batch.edge_index.to(device))[:batch.batch_size]
        loss = F.cross_entropy(y_hat, y,weight=classweight)
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * batch.batch_size
        total_correct += int((y_hat.argmax(dim=-1) == y).sum())
        total_examples += batch.batch_size
        #pbar.update(batch.batch_size)
    #pbar.close()

    return total_loss / total_examples, total_correct / total_examples


@torch.no_grad()
def test():
    model.eval()
    y_hat = model.inference(data.x, subgraph_loader).argmax(dim=-1)
    y = data.y.to(y_hat.device)

    accs = []
    for mask in [data.train_mask, data.test_mask]:
        accs.append(int((y_hat[mask] == y[mask]).sum()) / int(mask.sum()))
    return accs


for epoch in range(1, 51):
    loss, acc = train(epoch)
    if epoch % 5 == 3:
        print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {acc:.4f}')
    train_acc, test_acc = test()
    if epoch % 5 == 0:
        print(f'Epoch: {epoch:02d}, Train: {train_acc:.4f},'
            f'Test: {test_acc:.4f}')

Epoch 03, Loss: 0.1767, Approx. Train: 0.9610
Epoch: 05, Train: 0.9799,Test: 0.9630
Epoch 08, Loss: 0.1286, Approx. Train: 0.9689
Epoch: 10, Train: 0.9841,Test: 0.9557
Epoch 13, Loss: 0.1141, Approx. Train: 0.9732
Epoch: 15, Train: 0.9875,Test: 0.9647
Epoch 18, Loss: 0.1110, Approx. Train: 0.9728
Epoch: 20, Train: 0.9871,Test: 0.9624
Epoch 23, Loss: 0.1837, Approx. Train: 0.9726
Epoch: 25, Train: 0.9884,Test: 0.9523
Epoch 28, Loss: 0.0943, Approx. Train: 0.9787
Epoch: 30, Train: 0.9908,Test: 0.9650
Epoch 33, Loss: 0.0907, Approx. Train: 0.9798
Epoch: 35, Train: 0.9911,Test: 0.9652
Epoch 38, Loss: 0.0676, Approx. Train: 0.9824
Epoch: 40, Train: 0.9910,Test: 0.9639
Epoch 43, Loss: 0.0824, Approx. Train: 0.9803
Epoch: 45, Train: 0.9897,Test: 0.9657
Epoch 48, Loss: 0.0790, Approx. Train: 0.9795
Epoch: 50, Train: 0.9913,Test: 0.9655


In [5]:
from sklearn.metrics import roc_curve, confusion_matrix
y_hat = model.inference(data.x, subgraph_loader).argmax(dim=-1)
print(confusion_matrix(data.y[data.test_mask],y_hat[data.test_mask]))

[[15490    97]
 [  478   605]]


In [6]:
import pandas as pd
pd.Series(data.y[data.test_mask]).value_counts()

0    15587
1     1083
dtype: int64

In [8]:
1-(97+478)/(15587+1083)

0.9655068986202759