In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.loader import LinkNeighborLoader

from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device, "torch:", torch.__version__, "cuda:", torch.version.cuda)


  from .autonotebook import tqdm as notebook_tqdm


device: cuda torch: 2.5.1 cuda: 12.1


In [2]:
# change if needed
CSV_FILE = "HI-Small_Trans.csv"

df = pd.read_csv(CSV_FILE)
print(df.shape)
df.head()


(5078345, 11)


Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [3]:
df["ts"] = pd.to_datetime(df["Timestamp"], errors="coerce")
df = df.dropna(subset=["ts"]).sort_values("ts").reset_index(drop=True)

# Rename for sanity
df = df.rename(columns={
    "Account": "src_acct",
    "Account.1": "dst_acct",
    "From Bank": "src_bank",
    "To Bank": "dst_bank",
    "Amount Paid": "amt_paid",
    "Amount Received": "amt_recv",
    "Payment Currency": "pay_ccy",
    "Receiving Currency": "recv_ccy",
    "Payment Format": "pay_fmt",
    "Is Laundering": "y"
})

# ensure labels 0/1 int
df["y"] = df["y"].astype(int)

# time split
cut = int(len(df) * 0.8)
train_df = df.iloc[:cut].copy()
test_df  = df.iloc[cut:].copy()

print("train pos rate:", train_df["y"].mean())
print("test  pos rate:", test_df["y"].mean())
print("rows train/test:", len(train_df), len(test_df))


train pos rate: 0.000831963956761504
test  pos rate: 0.001769277195621802
rows train/test: 4062676 1015669


In [4]:
# Map account IDs to contiguous node ids
all_accts = pd.Index(pd.concat([df["src_acct"], df["dst_acct"]], ignore_index=True).astype(str).unique())
acct2id = {a:i for i,a in enumerate(all_accts)}
num_nodes = len(all_accts)
print("num_nodes:", num_nodes)

def make_edge_table(dfx: pd.DataFrame):
    src = dfx["src_acct"].astype(str).map(acct2id).astype(np.int64).to_numpy()
    dst = dfx["dst_acct"].astype(str).map(acct2id).astype(np.int64).to_numpy()
    y   = dfx["y"].astype(np.int64).to_numpy()

    # numeric amount feature(s) (log helps)
    amt = dfx["amt_paid"].astype(float).to_numpy()
    amt = np.log1p(np.clip(amt, 0, None))  # log(1+amt)

    # categorical -> codes (fast, compact)
    pay_ccy = dfx["pay_ccy"].astype(str).astype("category").cat.codes.to_numpy()
    recv_ccy = dfx["recv_ccy"].astype(str).astype("category").cat.codes.to_numpy()
    pay_fmt = dfx["pay_fmt"].astype(str).astype("category").cat.codes.to_numpy()

    # stack edge features (float32)
    edge_attr = np.stack([amt, pay_ccy, recv_ccy, pay_fmt], axis=1).astype(np.float32)
    return src, dst, y, edge_attr

src_tr, dst_tr, y_tr, eattr_tr = make_edge_table(train_df)
src_te, dst_te, y_te, eattr_te = make_edge_table(test_df)

print("train edges:", len(y_tr), "test edges:", len(y_te))
print("edge_attr dim:", eattr_tr.shape[1])


num_nodes: 515080
train edges: 4062676 test edges: 1015669
edge_attr dim: 4


In [5]:
# Graph edges from TRAIN only
edge_index_train = torch.tensor(
    np.stack([src_tr, dst_tr], axis=0),
    dtype=torch.long
)

data = Data(edge_index=edge_index_train, num_nodes=num_nodes)

# For link prediction, we provide edge_label_index + edge_label
edge_label_index_tr = torch.tensor(np.stack([src_tr, dst_tr], axis=0), dtype=torch.long)
edge_label_tr = torch.tensor(y_tr, dtype=torch.float32)

edge_label_index_te = torch.tensor(np.stack([src_te, dst_te], axis=0), dtype=torch.long)
edge_label_te = torch.tensor(y_te, dtype=torch.float32)

# Neighbor sampling sizes: keep small to avoid OOM
num_neighbors = [10, 5]
batch_size = 4096

train_loader = LinkNeighborLoader(
    data=data,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    edge_label_index=edge_label_index_tr,
    edge_label=edge_label_tr,
    shuffle=True,
)

test_loader = LinkNeighborLoader(
    data=data,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    edge_label_index=edge_label_index_te,
    edge_label=edge_label_te,
    shuffle=False,
)


In [6]:
from torch_geometric.nn import SAGEConv

class SAGEEdgeClassifier(nn.Module):
    def __init__(self, num_nodes, emb_dim=64, hidden=64, edge_feat_dim=4):
        super().__init__()
        self.node_emb = nn.Embedding(num_nodes, emb_dim)

        self.conv1 = SAGEConv(emb_dim, hidden)
        self.conv2 = SAGEConv(hidden, emb_dim)

        # edge scorer: [h_u, h_v, |diff|, edge_feat] -> logit
        in_dim = emb_dim*3 + edge_feat_dim
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def encode(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = F.relu(h)
        h = self.conv2(h, edge_index)
        return h

    def forward(self, batch, edge_feat):
        # batch has n_id (global node ids for this subgraph)
        x = self.node_emb(batch.n_id)
        h = self.encode(x, batch.edge_index)

        # edge_label_index is LOCAL indices into batch nodes
        src = batch.edge_label_index[0]
        dst = batch.edge_label_index[1]

        hs = h[src]
        hd = h[dst]
        feat = torch.cat([hs, hd, torch.abs(hs - hd), edge_feat], dim=1)
        return self.mlp(feat).view(-1)

model = SAGEEdgeClassifier(num_nodes=num_nodes, emb_dim=64, hidden=64, edge_feat_dim=eattr_tr.shape[1]).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# pos_weight for BCE: (neg/pos)
pos = y_tr.sum()
neg = len(y_tr) - pos
pos_weight = torch.tensor([neg / (pos + 1e-9)], dtype=torch.float32, device=device)
crit = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
print("pos_weight:", pos_weight.item())


pos_weight: 1200.97509765625


In [7]:
def build_key2idx(src, dst):
    # key as 64-bit packed int to avoid python tuple overhead
    # pack: (src << 32) | dst  (works if node ids < 2^32)
    key = (src.astype(np.int64) << 32) | dst.astype(np.int64)
    return {int(k): i for i, k in enumerate(key)}

train_key2idx = build_key2idx(src_tr, dst_tr)
test_key2idx  = build_key2idx(src_te, dst_te)

eattr_tr_t = torch.tensor(eattr_tr, dtype=torch.float32, device=device)
eattr_te_t = torch.tensor(eattr_te, dtype=torch.float32, device=device)

def get_edge_feat_for_batch(batch, key2idx, edge_attr_tensor):
    # local edge endpoints -> global node ids
    src_local = batch.edge_label_index[0]
    dst_local = batch.edge_label_index[1]
    src_global = batch.n_id[src_local].detach().cpu().numpy().astype(np.int64)
    dst_global = batch.n_id[dst_local].detach().cpu().numpy().astype(np.int64)

    keys = (src_global << 32) | dst_global
    idxs = [key2idx[int(k)] for k in keys]
    return edge_attr_tensor[idxs]


In [8]:
from tqdm.auto import tqdm
import numpy as np

EPOCHS = 5

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"train epoch {epoch}"):
        batch = batch.to(device)
        edge_feat = get_edge_feat_for_batch(batch, train_key2idx, eattr_tr_t)

        logits = model(batch, edge_feat)
        loss = crit(logits, batch.edge_label.to(device))

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += float(loss.item())

    # eval
    model.eval()
    probs, ys = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"eval epoch {epoch}"):
            batch = batch.to(device)
            edge_feat = get_edge_feat_for_batch(batch, test_key2idx, eattr_te_t)

            logit = model(batch, edge_feat)
            p = torch.sigmoid(logit).detach().cpu().numpy()
            y = batch.edge_label.detach().cpu().numpy()

            probs.append(p)
            ys.append(y)

    y_true = np.concatenate(ys)
    y_prob = np.concatenate(probs)
    pr_auc = average_precision_score(y_true, y_prob)

    print(f"epoch {epoch:02d} | loss {total_loss:.4f} | PR-AUC {pr_auc:.6f}")


train epoch 1: 100%|█████████████████████████████████████████████████████████████████| 992/992 [00:45<00:00, 21.70it/s]
eval epoch 1: 100%|██████████████████████████████████████████████████████████████████| 248/248 [00:05<00:00, 46.30it/s]


epoch 01 | loss 620.1835 | PR-AUC 0.049736


train epoch 2: 100%|█████████████████████████████████████████████████████████████████| 992/992 [00:48<00:00, 20.53it/s]
eval epoch 2: 100%|██████████████████████████████████████████████████████████████████| 248/248 [00:05<00:00, 45.21it/s]


epoch 02 | loss 367.1571 | PR-AUC 0.076606


train epoch 3: 100%|█████████████████████████████████████████████████████████████████| 992/992 [00:49<00:00, 20.17it/s]
eval epoch 3: 100%|██████████████████████████████████████████████████████████████████| 248/248 [00:05<00:00, 45.24it/s]


epoch 03 | loss 222.7732 | PR-AUC 0.105318


train epoch 4: 100%|█████████████████████████████████████████████████████████████████| 992/992 [00:48<00:00, 20.51it/s]
eval epoch 4: 100%|██████████████████████████████████████████████████████████████████| 248/248 [00:05<00:00, 43.11it/s]


epoch 04 | loss 142.6221 | PR-AUC 0.124013


train epoch 5: 100%|█████████████████████████████████████████████████████████████████| 992/992 [00:48<00:00, 20.45it/s]
eval epoch 5: 100%|██████████████████████████████████████████████████████████████████| 248/248 [00:05<00:00, 45.26it/s]


epoch 05 | loss 94.6748 | PR-AUC 0.127497


In [9]:
# Pull node embeddings (fast and stable)
H = model.node_emb.weight.detach().cpu().numpy()  # (num_nodes, emb_dim)

def build_hybrid_X(src, dst, edge_attr, H):
    Hs = H[src]
    Hd = H[dst]
    X = np.hstack([
        Hs, Hd,
        np.abs(Hs - Hd),
        Hs * Hd,
        edge_attr
    ]).astype(np.float32)
    return X

X_train = build_hybrid_X(src_tr, dst_tr, eattr_tr, H)
X_test  = build_hybrid_X(src_te, dst_te, eattr_te, H)

y_train = y_tr.astype(int)
y_test  = y_te.astype(int)

print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("pos rate train/test:", y_train.mean(), y_test.mean())


X_train: (4062676, 260) X_test: (1015669, 260)
pos rate train/test: 0.000831963956761504 0.001769277195621802


In [10]:
# sample weights: upweight positives
w_pos = (len(y_train) - y_train.sum()) / (y_train.sum() + 1e-9)
sample_weight = np.where(y_train == 1, w_pos, 1.0).astype(np.float32)

hgb = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.1,
    max_iter=300,
)
hgb.fit(X_train, y_train, sample_weight=sample_weight)

p = hgb.predict_proba(X_test)[:, 1]
pr_auc = average_precision_score(y_test, p)
print("Hybrid (HGB weighted) PR-AUC:", pr_auc)

# Precision@k / Recall@k at top 1%
k = int(0.01 * len(p))
top = np.argsort(p)[-k:]
precision_at_1pct = y_test[top].mean()
recall_at_1pct = y_test[top].sum() / (y_test.sum() + 1e-9)
print("Precision@1%:", precision_at_1pct)
print("Recall@1%:", recall_at_1pct)


Hybrid (HGB weighted) PR-AUC: 0.13147641569564023
Precision@1%: 0.057207562032296176
Recall@1%: 0.32331663884233536


In [14]:
# ONE-CELL: build a results table for Edge-only HGB, GNN-only, and Hybrid
import numpy as np, pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, precision_score, recall_score

def eval_metrics(y_true, p, top_frac=0.01):
    y_true = np.asarray(y_true).astype(int)
    p = np.asarray(p).astype(float)

    pr_auc = average_precision_score(y_true, p)
    roc_auc = roc_auc_score(y_true, p) if len(np.unique(y_true)) > 1 else np.nan

    k = max(1, int(top_frac * len(p)))
    top_idx = np.argsort(p)[-k:]
    prec_at = y_true[top_idx].mean()
    rec_at = y_true[top_idx].sum() / (y_true.sum() + 1e-9)

    # threshold @0.5 (mostly for completeness; not great under heavy imbalance)
    y_pred = (p >= 0.5).astype(int)
    acc = (y_pred == y_true).mean()
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)

    return {
        "PR_AUC": pr_auc,
        "ROC_AUC": roc_auc,
        "Precision@1%": prec_at,
        "Recall@1%": rec_at,
        "Alerts@1%": k,
        "Accuracy@0.5": acc,
        "Prec@0.5": prec,
        "Rec@0.5": rec,
    }

rows = []

# ----- Edge-only baseline (HGB) -----
# expects: edge_attr_tr_np, edge_attr_te_np, y_train, y_test
try:
    X_edge_train = edge_attr_tr_np
    X_edge_test  = edge_attr_te_np
    y_tr = np.asarray(y_train).astype(int)
    y_te = np.asarray(y_test).astype(int)

    w_pos = (len(y_tr) - y_tr.sum()) / (y_tr.sum() + 1e-9)
    sample_weight = np.where(y_tr == 1, w_pos, 1.0).astype(np.float32)

    hgb_edge = HistGradientBoostingClassifier(max_depth=6, learning_rate=0.1, max_iter=300)
    hgb_edge.fit(X_edge_train, y_tr, sample_weight=sample_weight)
    p_edge = hgb_edge.predict_proba(X_edge_test)[:, 1]

    m = eval_metrics(y_te, p_edge)
    m["Model"] = "Edge-only (HGB)"
    rows.append(m)
except NameError as e:
    print("Edge-only skipped (missing vars):", e)

# ----- GNN-only -----
# expects: ys, probs from your GNN eval loop OR y_true_gnn, y_prob_gnn
try:
    if "y_true_gnn" in globals() and "y_prob_gnn" in globals():
        y_gnn = y_true_gnn
        p_gnn = y_prob_gnn
    else:
        y_gnn = np.concatenate(ys)
        p_gnn = np.concatenate(probs)

    m = eval_metrics(y_gnn, p_gnn)
    m["Model"] = "GNN-only"
    rows.append(m)
except NameError as e:
    print("GNN-only skipped (missing vars):", e)

# ----- Hybrid -----
# expects: p_hybrid and y_test (or y_te)
try:
    p_h = p_hybrid
    y_te = np.asarray(y_test).astype(int)

    m = eval_metrics(y_te, p_h)
    m["Model"] = "Hybrid"
    rows.append(m)
except NameError as e:
    print("Hybrid skipped (missing vars):", e)

results_df = pd.DataFrame(rows)[
    ["Model","PR_AUC","ROC_AUC","Precision@1%","Recall@1%","Alerts@1%","Accuracy@0.5","Prec@0.5","Rec@0.5"]
].sort_values("PR_AUC", ascending=False)

results_df


Edge-only skipped (missing vars): name 'edge_attr_tr_np' is not defined
Hybrid skipped (missing vars): name 'p_hybrid' is not defined


Unnamed: 0,Model,PR_AUC,ROC_AUC,Precision@1%,Recall@1%,Alerts@1%,Accuracy@0.5,Prec@0.5,Rec@0.5
0,GNN-only,0.127497,0.930089,0.06794,0.383973,10156,0.9371,0.014664,0.521981
