In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/Graph_Neural_Network')

In [3]:
!pip install torch_geometric==2.5.0
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from torch_geometric.data import Data
from torch_geometric.nn import TAGConv
from sklearn.model_selection import train_test_split

Collecting torch_geometric==2.5.0
  Downloading torch_geometric-2.5.0-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.5.0


In [4]:
df_classes = pd.read_csv('./data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
df_edges = pd.read_csv('./data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
df_features = pd.read_csv('./data/elliptic_bitcoin_dataset/elliptic_txs_features.csv', header=None)

In [5]:
# 删除 unknown 类别，仅保留合法/非法交易（1/2）
df_classes = df_classes[df_classes['class'] != 'unknown']
df_classes['class'] = df_classes['class'].map({'1': 1, '2': 0})  # 1: illicit, 0: licit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_classes['class'] = df_classes['class'].map({'1': 1, '2': 0})  # 1: illicit, 0: licit


In [6]:
# 合并特征与类别标签
df_merge = df_features.merge(df_classes, how='inner', right_on="txId", left_on=0)
df_merge = df_merge.drop(['txId'], axis=1)

In [7]:
# ====================== 图构建 ======================
nodes = df_merge[0].values
map_id = {j: i for i, j in enumerate(nodes)}

edges = df_edges[df_edges.txId1.isin(map_id) & df_edges.txId2.isin(map_id)].copy()
edges.txId1 = edges.txId1.map(map_id)
edges.txId2 = edges.txId2.map(map_id)
edges = edges.astype(int)
edge_index = np.array(edges.values).T
edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
weights = torch.tensor([1] * edge_index.shape[1], dtype=torch.float32)

labels = torch.tensor(df_merge['class'].values, dtype=torch.float32)
node_features = torch.tensor(np.array(df_merge.drop([0, 'class', 1], axis=1).values), dtype=torch.float32)

elliptic_dataset = Data(x=node_features, edge_index=edge_index, edge_weights=weights, y=labels)

In [8]:
# ====================== 超参数与划分 ======================
seed = 0
# 所有节点都是有标签样本，划分训练/验证/测试集
all_idx = np.arange(len(labels))
y_all = labels[all_idx]

# 按70:15:15的比例划分训练集、验证集、测试集
train_idx, temp_idx = train_test_split(all_idx, test_size=0.3, random_state=seed, stratify=y_all)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=seed, stratify=labels[temp_idx])

elliptic_dataset.train_idx = torch.tensor(train_idx, dtype=torch.long)
elliptic_dataset.val_idx = torch.tensor(val_idx, dtype=torch.long)
elliptic_dataset.test_idx = torch.tensor(test_idx, dtype=torch.long)

In [9]:
# ----------------- 模型定义 -----------------
class TGNN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out, K=3):
        super().__init__()
        self.norm1 = torch.nn.BatchNorm1d(dim_in)
        self.gat1 = TAGConv(dim_in, dim_h, K)
        self.norm2 = torch.nn.BatchNorm1d(dim_h)
        self.gat2 = TAGConv(dim_h, dim_out, K)

    def forward(self, x, edge_index):
        h = self.norm1(x)
        h = self.gat1(h, edge_index)
        h = self.norm2(h)
        h = F.leaky_relu(h)
        out = self.gat2(h, edge_index)
        return out

def train(model, data, criterion, optimizer, num_epochs, checkpoint_dir, model_filename):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    best_loss = float('inf')
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        pred = model(data.x, data.edge_index)
        loss = criterion(pred[data.train_idx], data.y[data.train_idx].unsqueeze(1))
        loss.backward()
        optimizer.step()

        val_loss = criterion(pred[data.val_idx], data.y[data.val_idx].unsqueeze(1))
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save({'state_dict': model.state_dict()}, os.path.join(checkpoint_dir, model_filename))
    # 加载最佳模型权重
    checkpoint = torch.load(os.path.join(checkpoint_dir, model_filename))
    model.load_state_dict(checkpoint['state_dict'])
    return model

def test(model, data):
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index).squeeze(1)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).long().cpu().numpy()
        labels = data.y.cpu().numpy()
    return preds, labels

def evaluate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred)
    return acc, prec, rec, f1, mcc

In [13]:
# ----------------- 消融实验循环 -----------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = 165
hidden_size = 150
output_dim = 1
learning_rate = 0.005
weight_decay = 1e-5
num_epochs = 400
checkpoints_dir = './result/models/elliptic_tgnn_ablation'

# 把数据转device
data_train = elliptic_dataset.to(device)


results = []
for K_hop in range(1, 6):  # 从1到5
    print(f'\n==== Training TAGCN with K={K_hop} ====')
    model = TGNN(input_dim, hidden_size, output_dim, K=K_hop).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()

    # model = train(model, data_train, criterion, optimizer, num_epochs,
    #               checkpoints_dir, f'tgnn_best_model_K{K_hop}.pth.tar')
    model.load_state_dict(torch.load(os.path.join(checkpoints_dir, 'tgnn_best_model_K%s.pth.tar' % K_hop))['state_dict'])


    preds, labels = test(model, data_train)
    # 只选测试集节点
    test_mask = elliptic_dataset.test_idx.cpu().numpy()
    y_true = labels[test_mask]
    y_pred = preds[test_mask]

    acc, prec, rec, f1, mcc = evaluate_metrics(y_true, y_pred)
    print(f'K={K_hop} Test Metrics: Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} F1={f1:.4f} MCC={mcc:.4f}')
    results.append({
        'K-hop': K_hop,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1,
        'MCC': mcc
    })

# 转成DataFrame展示
df_results = pd.DataFrame(results)
print('\n===== Ablation Study Results =====')
print(df_results.to_string(index=False))


==== Training TAGCN with K=1 ====
K=1 Test Metrics: Acc=0.9801 Prec=0.9429 Rec=0.8475 F1=0.8927 MCC=0.8833

==== Training TAGCN with K=2 ====
K=2 Test Metrics: Acc=0.9802 Prec=0.9359 Rec=0.8563 F1=0.8943 MCC=0.8845

==== Training TAGCN with K=3 ====
K=3 Test Metrics: Acc=0.9814 Prec=0.9423 Rec=0.8622 F1=0.9005 MCC=0.8913

==== Training TAGCN with K=4 ====
K=4 Test Metrics: Acc=0.9805 Prec=0.9403 Rec=0.8548 F1=0.8955 MCC=0.8861

==== Training TAGCN with K=5 ====
K=5 Test Metrics: Acc=0.9795 Prec=0.9178 Rec=0.8680 F1=0.8922 MCC=0.8814

===== Ablation Study Results =====
 K-hop  Accuracy  Precision   Recall  F1-score      MCC
     1  0.980100   0.942904 0.847507  0.892664 0.883255
     2  0.980243   0.935897 0.856305  0.894334 0.884522
     3  0.981389   0.942308 0.862170  0.900459 0.891286
     4  0.980530   0.940323 0.854839  0.895545 0.886058
     5  0.979528   0.917829 0.868035  0.892238 0.881356


In [14]:
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd

# 只保留 Recall, F1-score, MCC 三个指标
df_long = df_results.melt(
    id_vars=['K-hop'],
    value_vars=['Recall', 'F1-score', 'MCC'],
    var_name='Metric',
    value_name='Score'
)

# 折线图绘制
fig = px.line(
    df_long,
    x='K-hop',
    y='Score',
    color='Metric',
    markers=True,
    line_shape='linear'
)

# 控制图片大小（宽度和高度）
fig.update_layout(
    width=1000,     # 控制图宽度，例如800像素
    height=500,    # 控制图高度
    xaxis=dict(
        title='K-hop',
        dtick=1,
        titlefont=dict(size=16),
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title='Metric Value',
        range=[0.84, 0.91],
        titlefont=dict(size=16),
        tickfont=dict(size=14)
    ),
    legend_title_text='Evaluation Metric',
    legend=dict(font=dict(size=14)),
    template='plotly_white'
)
fig.show()