In [1]:
from google.colab import drive
drive.mount("/content/drive")

import os, shutil, time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import trange


Mounted at /content/drive


In [2]:
BASE = "/content/drive/MyDrive"

HI_SMALL = f"{BASE}/HI-Small_Trans.csv"
HI_MED   = f"{BASE}/HI-Medium_Trans.csv"

def copy_to_local(src_path):
    dst_path = "/content/" + os.path.basename(src_path)
    if not os.path.exists(dst_path):
        t0 = time.time()
        shutil.copyfile(src_path, dst_path)
        print("Copied", os.path.basename(src_path), "->", dst_path,
              "GB:", round(os.path.getsize(dst_path)/1e9, 2),
              "sec:", round(time.time()-t0, 1))
    else:
        print("Using cached local:", dst_path)
    return dst_path

SMALL_LOCAL = copy_to_local(HI_SMALL)
MED_LOCAL   = copy_to_local(HI_MED)   # comment this out if you only want small first


Copied HI-Small_Trans.csv -> /content/HI-Small_Trans.csv GB: 0.48 sec: 7.3
Copied HI-Medium_Trans.csv -> /content/HI-Medium_Trans.csv GB: 3.03 sec: 38.0


In [3]:
usecols = ["From Bank","Account","To Bank","Account.1","Is Laundering"]

small = pd.read_csv(SMALL_LOCAL, usecols=usecols)
print("HI-Small rows:", len(small), "pos rate:", small["Is Laundering"].mean())
small.head(2)


HI-Small rows: 5078345 pos rate: 0.0010194266045335635


Unnamed: 0,From Bank,Account,To Bank,Account.1,Is Laundering
0,10,8000EBD30,10,8000EBD30,0
1,3208,8000F4580,1,8000F5340,0


In [4]:
src_df = small[["From Bank","Account"]]
dst_df = small[["To Bank","Account.1"]].rename(columns={"To Bank":"From Bank","Account.1":"Account"})
all_df = pd.concat([src_df, dst_df], ignore_index=True)

# factorize tuple keys
codes, uniques = pd.factorize(list(map(tuple, all_df.to_numpy())))
m = len(small)

src = codes[:m].astype(np.int64)
dst = codes[m:].astype(np.int64)
y   = small["Is Laundering"].astype(np.float32).to_numpy()

num_nodes = len(uniques)
print("nodes:", num_nodes, "edges:", m)


  codes, uniques = pd.factorize(list(map(tuple, all_df.to_numpy())))


nodes: 515088 edges: 5078345


In [5]:
import torch
print("cuda available:", torch.cuda.is_available())
print("device:", "cuda" if torch.cuda.is_available() else "cpu")
!nvidia-smi


cuda available: True
device: cuda
Mon Jan 26 01:58:19 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   43C    P8             12W /   72W |       3MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
              

In [6]:
# UPDATED EdgeMLP (dropout + slightly stronger MLP) + pos_weight support
# (same name/class so the rest of your notebook doesn't change)

import torch
import torch.nn as nn

class EdgeMLP(nn.Module):
    def __init__(self, n_nodes, d=128, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(n_nodes, d)
        self.drop = nn.Dropout(dropout)
        self.net = nn.Sequential(
            nn.Linear(4*d, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, s, t):
        Hs = self.drop(self.emb(s))
        Hd = self.drop(self.emb(t))
        x = torch.cat([Hs, Hd, (Hs - Hd).abs(), Hs * Hd], dim=1)
        return self.net(x).squeeze(-1)

print("EdgeMLP updated ‚úÖ")


EdgeMLP updated ‚úÖ


In [7]:
device = torch.device("cuda")
from tqdm import trange

In [8]:
pos_rate = y.mean()
neg_rate = 1 - pos_rate
pos_weight = torch.tensor([neg_rate / pos_rate], device=device)

print("pos_weight:", float(pos_weight))

loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)


pos_weight: 979.943603515625


In [9]:
import torch
import torch.nn as nn
from tqdm import trange
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

device = torch.device("cuda")

# tensors
src_t = torch.from_numpy(src).long()
dst_t = torch.from_numpy(dst).long()
y_t   = torch.from_numpy(y).float()   # ‚úÖ FIX: float labels for BCE

# model
model = EdgeMLP(num_nodes, d=128).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# move full arrays to GPU once (fast)
src_gpu = src_t.to(device, non_blocking=True)
dst_gpu = dst_t.to(device, non_blocking=True)
y_gpu   = y_t.to(device, non_blocking=True)

EPOCHS = 10

# start big; if OOM, it will automatically drop
BATCH_CANDIDATES = [262144, 131072, 65536, 32768]

def train_one_epoch(batch_size: int):
    model.train()
    perm = torch.randperm(len(src_t))  # CPU perm (safe)
    running = 0.0

    for step in trange(0, len(src_t), batch_size, desc=f"train bs={batch_size}"):
        idx = perm[step:step+batch_size].to(device, non_blocking=True)

        s = src_gpu[idx]
        t = dst_gpu[idx]
        yy = y_gpu[idx]

        opt.zero_grad(set_to_none=True)
        logits = model(s, t)
        loss = loss_fn(logits, yy)
        loss.backward()
        opt.step()

        running += loss.detach().item() * s.size(0)

    return running / len(src_t)

@torch.no_grad()
def quick_eval_sample(n=1_000_000):
    model.eval()
    n = min(n, len(src_t))
    idx = torch.randperm(len(src_t), device=device)[:n]
    p = torch.sigmoid(model(src_gpu[idx], dst_gpu[idx])).detach().cpu().numpy()
    yt = y_gpu[idx].detach().cpu().numpy()
    return roc_auc_score(yt, p), average_precision_score(yt, p), float(p.mean()), float(yt.mean())

# training loop with auto OOM fallback
for ep in range(EPOCHS):
    used_bs = None
    for bs in BATCH_CANDIDATES:
        try:
            avg_loss = train_one_epoch(bs)
            used_bs = bs
            break
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            print(f"OOM at batch={bs}, trying smaller...")
            continue

    if used_bs is None:
        raise RuntimeError("All batch sizes OOM. Something is wrong (or arrays are enormous).")

    roc, pr, pmean, ymean = quick_eval_sample(n=500_000)
    print(f"Epoch {ep:02d} | bs={used_bs} | avg_loss={avg_loss:.6f} | ROC={roc:.4f} | PR={pr:.4f} | pred_mean={pmean:.4f} | label_mean={ymean:.4f}")


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:03<00:00,  6.60it/s]


Epoch 00 | bs=262144 | avg_loss=0.141388 | ROC=0.4586 | PR=0.0009 | pred_mean=0.0000 | label_mean=0.0010


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 01 | bs=262144 | avg_loss=0.016266 | ROC=0.5345 | PR=0.0011 | pred_mean=0.0000 | label_mean=0.0011


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.97it/s]


Epoch 02 | bs=262144 | avg_loss=0.014505 | ROC=0.6085 | PR=0.0017 | pred_mean=0.0000 | label_mean=0.0011


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 03 | bs=262144 | avg_loss=0.010887 | ROC=0.6546 | PR=0.0020 | pred_mean=0.0002 | label_mean=0.0010


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 04 | bs=262144 | avg_loss=0.008635 | ROC=0.6891 | PR=0.0029 | pred_mean=0.0012 | label_mean=0.0010


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 05 | bs=262144 | avg_loss=0.008135 | ROC=0.7092 | PR=0.0039 | pred_mean=0.0017 | label_mean=0.0010


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 06 | bs=262144 | avg_loss=0.008056 | ROC=0.7120 | PR=0.0067 | pred_mean=0.0014 | label_mean=0.0010


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 07 | bs=262144 | avg_loss=0.008010 | ROC=0.7182 | PR=0.0049 | pred_mean=0.0014 | label_mean=0.0011


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.96it/s]


Epoch 08 | bs=262144 | avg_loss=0.007964 | ROC=0.7471 | PR=0.0267 | pred_mean=0.0014 | label_mean=0.0009


train bs=262144: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.95it/s]


Epoch 09 | bs=262144 | avg_loss=0.007897 | ROC=0.7655 | PR=0.0284 | pred_mean=0.0014 | label_mean=0.0011


In [10]:
import pandas as pd
import numpy as np
import torch
from tqdm import trange
from sklearn.metrics import roc_auc_score, average_precision_score

usecols = ["From Bank","Account","To Bank","Account.1","Is Laundering"]

MED_PATH = "/content/HI-Medium_Trans.csv"  # change if needed
med = pd.read_csv(MED_PATH, usecols=usecols)

print("HI-Medium rows:", len(med), "pos rate:", med["Is Laundering"].mean())


HI-Medium rows: 31898238 pos rate: 0.0011044497191349566


In [11]:
# uniques is what you got from pd.factorize on SMALL mapping step
node_to_id = {k:i for i,k in enumerate(uniques)}
print("node_to_id size:", len(node_to_id))


node_to_id size: 515088


In [12]:
src_keys = list(map(tuple, med[["From Bank","Account"]].to_numpy()))
dst_keys = list(map(tuple, med[["To Bank","Account.1"]].rename(
    columns={"To Bank":"From Bank","Account.1":"Account"}
).to_numpy()))

src_md = np.array([node_to_id.get(k, -1) for k in src_keys], dtype=np.int64)
dst_md = np.array([node_to_id.get(k, -1) for k in dst_keys], dtype=np.int64)
y_md   = med["Is Laundering"].astype(np.float32).to_numpy()

mask = (src_md >= 0) & (dst_md >= 0)
src_md, dst_md, y_md = src_md[mask], dst_md[mask], y_md[mask]

print("Overlap edges:", len(y_md), "pos rate:", y_md.mean())


Overlap edges: 7 pos rate: 0.0


In [13]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import trange
from sklearn.metrics import roc_auc_score, average_precision_score

device = torch.device("cuda")

# --- split indices ---
N = len(y)
rng = np.random.default_rng(0)
perm = rng.permutation(N)

val_frac = 0.1
n_val = int(N * val_frac)
val_idx_np = perm[:n_val]
tr_idx_np  = perm[n_val:]

print("train edges:", len(tr_idx_np), "val edges:", len(val_idx_np),
      "train pos:", y[tr_idx_np].mean(), "val pos:", y[val_idx_np].mean())

# --- tensors ---
src_t = torch.from_numpy(src).long()
dst_t = torch.from_numpy(dst).long()
y_t   = torch.from_numpy(y).float()

src_gpu = src_t.to(device, non_blocking=True)
dst_gpu = dst_t.to(device, non_blocking=True)
y_gpu   = y_t.to(device, non_blocking=True)

tr_idx = torch.from_numpy(tr_idx_np).long()
val_idx = torch.from_numpy(val_idx_np).long()

# --- model (same as before) ---
class EdgeMLP(nn.Module):
    def __init__(self, n_nodes, d=128, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(n_nodes, d)
        self.drop = nn.Dropout(dropout)
        self.net = nn.Sequential(
            nn.Linear(4*d, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, s, t):
        Hs = self.drop(self.emb(s))
        Hd = self.drop(self.emb(t))
        x = torch.cat([Hs, Hd, (Hs - Hd).abs(), Hs * Hd], dim=1)
        return self.net(x).squeeze(-1)

model = EdgeMLP(num_nodes, d=128, dropout=0.2).to(device)

# --- pos_weight from TRAIN ONLY ---
pos_rate = float(y[tr_idx_np].mean())
pos_weight = torch.tensor([(1.0 - pos_rate) / max(pos_rate, 1e-12)], device=device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

BATCH = 262144
EPOCHS = 5  # start with 5; increase if val keeps improving

@torch.no_grad()
def eval_on(idx_t, max_n=1_000_000):
    model.eval()
    # optional cap for speed
    if idx_t.numel() > max_n:
        idx_t = idx_t[torch.randperm(idx_t.numel())[:max_n]]
    ps, ys = [], []
    for i in range(0, idx_t.numel(), BATCH):
        batch = idx_t[i:i+BATCH].to(device, non_blocking=True)
        p = torch.sigmoid(model(src_gpu[batch], dst_gpu[batch])).detach().cpu().numpy()
        yt = y_gpu[batch].detach().cpu().numpy()
        ps.append(p); ys.append(yt)
    ps = np.concatenate(ps); ys = np.concatenate(ys)
    return roc_auc_score(ys, ps), average_precision_score(ys, ps), float(ps.mean()), float(ys.mean())

# --- training ---
for ep in range(EPOCHS):
    model.train()
    # shuffle train indices each epoch
    order = tr_idx[torch.randperm(tr_idx.numel())]

    running = 0.0
    for i in trange(0, order.numel(), BATCH, desc=f"train ep{ep}"):
        batch = order[i:i+BATCH].to(device, non_blocking=True)
        s, t, yy = src_gpu[batch], dst_gpu[batch], y_gpu[batch]

        opt.zero_grad(set_to_none=True)
        logits = model(s, t)
        loss = loss_fn(logits, yy)
        loss.backward()
        opt.step()

        running += loss.detach().item() * s.size(0)

    tr_roc, tr_pr, tr_pmean, tr_ymean = eval_on(tr_idx, max_n=500_000)
    va_roc, va_pr, va_pmean, va_ymean = eval_on(val_idx, max_n=500_000)

    print(
        f"Epoch {ep:02d} | loss={running/order.numel():.6f} "
        f"| TRAIN ROC={tr_roc:.4f} PR={tr_pr:.4f} "
        f"| VAL ROC={va_roc:.4f} PR={va_pr:.4f} "
        f"| val_pred_mean={va_pmean:.4f} val_label_mean={va_ymean:.4f}"
    )


train edges: 4570511 val edges: 507834 train pos: 0.0010121407 val pos: 0.0010850002


train ep0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:01<00:00,  9.62it/s]


Epoch 00 | loss=1.334832 | TRAIN ROC=0.6556 PR=0.0052 | VAL ROC=0.6431 PR=0.0083 | val_pred_mean=0.4755 val_label_mean=0.0011


train ep1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:01<00:00,  9.63it/s]


Epoch 01 | loss=1.281710 | TRAIN ROC=0.7216 PR=0.0311 | VAL ROC=0.6745 PR=0.0266 | val_pred_mean=0.4849 val_label_mean=0.0011


train ep2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:01<00:00,  9.63it/s]


Epoch 02 | loss=1.243023 | TRAIN ROC=0.7467 PR=0.0399 | VAL ROC=0.6865 PR=0.0328 | val_pred_mean=0.4464 val_label_mean=0.0011


train ep3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:01<00:00,  9.62it/s]


Epoch 03 | loss=1.205700 | TRAIN ROC=0.7901 PR=0.0457 | VAL ROC=0.7009 PR=0.0326 | val_pred_mean=0.4408 val_label_mean=0.0011


train ep4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:01<00:00,  9.63it/s]


Epoch 04 | loss=1.164788 | TRAIN ROC=0.8136 PR=0.0439 | VAL ROC=0.7177 PR=0.0398 | val_pred_mean=0.4616 val_label_mean=0.0011


In [14]:
model.eval()
with torch.no_grad():
    H = model.emb.weight.detach().cpu().numpy()

print("Embeddings shape:", H.shape)


Embeddings shape: (515088, 128)


In [15]:
import numpy as np

MAX_TRAIN = 1_000_000
MAX_VAL   = 500_000

def build_features(idx_np, max_n):
    if len(idx_np) > max_n:
        idx_np = np.random.choice(idx_np, max_n, replace=False)
    s = src[idx_np]
    t = dst[idx_np]
    Hs = H[s]
    Hd = H[t]
    X = np.concatenate([Hs, Hd, np.abs(Hs - Hd), Hs * Hd], axis=1)
    y_out = y[idx_np]
    return X, y_out

Xtr, ytr = build_features(tr_idx_np, MAX_TRAIN)
Xva, yva = build_features(val_idx_np, MAX_VAL)

print("Xtr:", Xtr.shape, "pos:", ytr.mean())
print("Xva:", Xva.shape, "pos:", yva.mean())


Xtr: (1000000, 512) pos: 0.001042
Xva: (500000, 512) pos: 0.00109


In [16]:
!pip install xgboost




In [17]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score

dtrain = xgb.DMatrix(Xtr, label=ytr)
dval   = xgb.DMatrix(Xva, label=yva)

params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "max_depth": 8,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "scale_pos_weight": (1 - ytr.mean()) / ytr.mean(),
}

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=300,
    evals=[(dval, "val")],
    early_stopping_rounds=30,
    verbose_eval=20
)

p = bst.predict(dval)
print("Hybrid VAL ROC:", roc_auc_score(yva, p))
print("Hybrid VAL PR:", average_precision_score(yva, p))


[0]	val-aucpr:0.00138
[20]	val-aucpr:0.03033
[40]	val-aucpr:0.03178
[60]	val-aucpr:0.03283
[80]	val-aucpr:0.03402
[100]	val-aucpr:0.03578
[120]	val-aucpr:0.03741
[140]	val-aucpr:0.03898
[160]	val-aucpr:0.04018
[180]	val-aucpr:0.04234
[200]	val-aucpr:0.04360
[220]	val-aucpr:0.04457
[240]	val-aucpr:0.04578
[260]	val-aucpr:0.04358
[276]	val-aucpr:0.04435
Hybrid VAL ROC: 0.7399334081488271
Hybrid VAL PR: 0.044811399022796655


In [22]:
LARGE_PATH = "/content/drive/MyDrive/HI-Large_Trans.csv"
usecols = ["From Bank","Account","To Bank","Account.1","Is Laundering"]


In [23]:
import pandas as pd
import numpy as np

CHUNK = 2_000_000          # rows per chunk (adjust if RAM tight)
TARGET_EDGES = 12_000_000  # how many edges to keep total (train+val)
KEEP_PROB = None           # auto computed once we know total rows, or set like 0.05

node_to_id = {}
src_list, dst_list, y_list = [], [], []
n_total = 0
n_kept = 0

def get_id(bank, acct):
    key = (int(bank), str(acct))
    i = node_to_id.get(key)
    if i is None:
        i = len(node_to_id)
        node_to_id[key] = i
    return i

# optional: quick first pass to estimate rows so we can set KEEP_PROB
# (skip if you already know approx size)
print("Streaming HI-Large...")

for chunk in pd.read_csv(LARGE_PATH, usecols=usecols, chunksize=CHUNK):
    n = len(chunk)
    n_total += n

    # decide keep prob if not set
    if KEEP_PROB is None:
        # rough adaptive: aim to hit TARGET_EDGES in ~first few chunks
        # after 1 chunk we can estimate
        KEEP_PROB = min(1.0, TARGET_EDGES / max(n_total, 1))
        print("Initial KEEP_PROB:", KEEP_PROB)

    keep = np.random.rand(n) < KEEP_PROB
    if keep.sum() == 0:
        continue

    sub = chunk.loc[keep, ["From Bank","Account","To Bank","Account.1","Is Laundering"]]
    # build ids row-wise (fast enough on sampled subset)
    for fb, fa, tb, ta, lab in sub.itertuples(index=False, name=None):
        s = get_id(fb, fa)
        t = get_id(tb, ta)
        src_list.append(s)
        dst_list.append(t)
        y_list.append(float(lab))

    n_kept = len(y_list)
    print(f"rows_seen={n_total:,} kept={n_kept:,} nodes={len(node_to_id):,}")

    if n_kept >= TARGET_EDGES:
        break

src = np.array(src_list, dtype=np.int64)
dst = np.array(dst_list, dtype=np.int64)
y   = np.array(y_list, dtype=np.float32)

num_nodes = len(node_to_id)
print("DONE. edges:", len(y), "nodes:", num_nodes, "pos_rate:", y.mean())


Streaming HI-Large...
Initial KEEP_PROB: 1.0
rows_seen=2,000,000 kept=2,000,000 nodes=1,309,660
rows_seen=4,000,000 kept=4,000,000 nodes=1,697,530
rows_seen=6,000,000 kept=6,000,000 nodes=1,756,681
rows_seen=8,000,000 kept=8,000,000 nodes=1,761,718
rows_seen=10,000,000 kept=10,000,000 nodes=1,766,235
rows_seen=12,000,000 kept=12,000,000 nodes=1,932,541
DONE. edges: 12000000 nodes: 1932541 pos_rate: 0.0008373333


In [24]:
class EdgeMLP(nn.Module):
    def __init__(self, n_nodes, d=64, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(n_nodes, d)
        self.drop = nn.Dropout(dropout)
        self.net = nn.Sequential(
            nn.Linear(4*d, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, 1),
        )
    def forward(self, s, t):
        Hs = self.drop(self.emb(s))
        Hd = self.drop(self.emb(t))
        x = torch.cat([Hs, Hd, (Hs-Hd).abs(), Hs*Hd], dim=1)
        return self.net(x).squeeze(-1)


In [25]:
rng = np.random.default_rng(0)
perm = rng.permutation(len(y))

val_frac = 0.1
n_val = int(len(y) * val_frac)

val_idx_np = perm[:n_val]
tr_idx_np  = perm[n_val:]

print("train:", len(tr_idx_np), "val:", len(val_idx_np))
print("train pos:", y[tr_idx_np].mean(), "val pos:", y[val_idx_np].mean())


train: 10800000 val: 1200000
train pos: 0.0008390741 val pos: 0.0008216667


In [26]:
device = torch.device("cuda")

src_t = torch.from_numpy(src).long()
dst_t = torch.from_numpy(dst).long()
y_t   = torch.from_numpy(y).float()

src_gpu = src_t.to(device, non_blocking=True)
dst_gpu = dst_t.to(device, non_blocking=True)
y_gpu   = y_t.to(device, non_blocking=True)

tr_idx = torch.from_numpy(tr_idx_np).long()
val_idx = torch.from_numpy(val_idx_np).long()

pos_rate = float(y[tr_idx_np].mean())
pos_weight = torch.tensor([(1.0 - pos_rate) / max(pos_rate, 1e-12)], device=device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

model = EdgeMLP(num_nodes, d=64, dropout=0.3).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

BATCH = 131072
EPOCHS = 3


In [27]:
@torch.no_grad()
def eval_on(idx_np, cap=300_000):
    model.eval()
    if len(idx_np) > cap:
        idx_np = np.random.choice(idx_np, cap, replace=False)
    ps, ys = [], []
    for i in range(0, len(idx_np), BATCH):
        b = torch.from_numpy(idx_np[i:i+BATCH]).to(device)
        p = torch.sigmoid(model(src_gpu[b], dst_gpu[b])).cpu().numpy()
        yt = y_gpu[b].cpu().numpy()
        ps.append(p); ys.append(yt)
    ps = np.concatenate(ps); ys = np.concatenate(ys)
    return roc_auc_score(ys, ps), average_precision_score(ys, ps)


In [28]:
for ep in range(EPOCHS):
    model.train()
    order = tr_idx_np.copy()
    np.random.shuffle(order)

    running = 0.0
    for i in trange(0, len(order), BATCH, desc=f"HI-Large train ep{ep}"):
        b = torch.from_numpy(order[i:i+BATCH]).to(device)
        s, t, yy = src_gpu[b], dst_gpu[b], y_gpu[b]

        opt.zero_grad(set_to_none=True)
        logits = model(s, t)
        loss = loss_fn(logits, yy)
        loss.backward()
        opt.step()

        running += loss.detach().item() * s.size(0)

    tr_roc, tr_pr = eval_on(tr_idx_np)
    va_roc, va_pr = eval_on(val_idx_np)
    print(f"Epoch {ep:02d} | loss={running/len(order):.6f} | TRAIN ROC={tr_roc:.4f} PR={tr_pr:.4f} | VAL ROC={va_roc:.4f} PR={va_pr:.4f}")


HI-Large train ep0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.48it/s]


Epoch 00 | loss=1.289354 | TRAIN ROC=0.6451 PR=0.0014 | VAL ROC=0.6085 PR=0.0012


HI-Large train ep1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.48it/s]


Epoch 01 | loss=1.259434 | TRAIN ROC=0.6906 PR=0.0018 | VAL ROC=0.6370 PR=0.0012


HI-Large train ep2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.48it/s]


Epoch 02 | loss=1.244911 | TRAIN ROC=0.7363 PR=0.0037 | VAL ROC=0.6815 PR=0.0033


In [29]:
BEST_PR = -1
patience = 3
bad = 0
best_state = None

EPOCHS = 10

for ep in range(EPOCHS):
    model.train()
    order = tr_idx_np.copy()
    np.random.shuffle(order)

    running = 0.0
    for i in trange(0, len(order), BATCH, desc=f"HI-Large train ep{ep}"):
        b = torch.from_numpy(order[i:i+BATCH]).to(device)
        s, t, yy = src_gpu[b], dst_gpu[b], y_gpu[b]

        opt.zero_grad(set_to_none=True)
        logits = model(s, t)
        loss = loss_fn(logits, yy)
        loss.backward()
        opt.step()
        running += loss.detach().item() * s.size(0)

    tr_roc, tr_pr = eval_on(tr_idx_np)
    va_roc, va_pr = eval_on(val_idx_np)
    print(f"Epoch {ep:02d} | loss={running/len(order):.6f} | TRAIN ROC={tr_roc:.4f} PR={tr_pr:.4f} | VAL ROC={va_roc:.4f} PR={va_pr:.4f}")

    # early stop on VAL PR
    if va_pr > BEST_PR + 1e-5:
        BEST_PR = va_pr
        bad = 0
        best_state = {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}
        print("  ‚úÖ new best VAL PR")
    else:
        bad += 1
        print(f"  ‚ö†Ô∏è no improvement ({bad}/{patience})")
        if bad >= patience:
            print("üõë early stopping")
            break

# restore best
if best_state is not None:
    model.load_state_dict(best_state)
    model.to(device)
print("Best VAL PR:", BEST_PR)


HI-Large train ep0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.47it/s]


Epoch 00 | loss=1.203273 | TRAIN ROC=0.8062 PR=0.0171 | VAL ROC=0.7171 PR=0.0252
  ‚úÖ new best VAL PR


HI-Large train ep1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.48it/s]


Epoch 01 | loss=1.106275 | TRAIN ROC=0.8582 PR=0.0378 | VAL ROC=0.7503 PR=0.0250
  ‚ö†Ô∏è no improvement (1/3)


HI-Large train ep2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.47it/s]


Epoch 02 | loss=0.937888 | TRAIN ROC=0.9592 PR=0.1764 | VAL ROC=0.8053 PR=0.0761
  ‚úÖ new best VAL PR


HI-Large train ep3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.46it/s]


Epoch 03 | loss=0.713609 | TRAIN ROC=0.9737 PR=0.3046 | VAL ROC=0.8223 PR=0.0948
  ‚úÖ new best VAL PR


HI-Large train ep4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.47it/s]


Epoch 04 | loss=0.519055 | TRAIN ROC=0.9879 PR=0.3421 | VAL ROC=0.8380 PR=0.1428
  ‚úÖ new best VAL PR


HI-Large train ep5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.46it/s]


Epoch 05 | loss=0.381916 | TRAIN ROC=0.9907 PR=0.4215 | VAL ROC=0.8453 PR=0.1741
  ‚úÖ new best VAL PR


HI-Large train ep6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.46it/s]


Epoch 06 | loss=0.278477 | TRAIN ROC=0.9976 PR=0.4799 | VAL ROC=0.8469 PR=0.1635
  ‚ö†Ô∏è no improvement (1/3)


HI-Large train ep7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.46it/s]


Epoch 07 | loss=0.202359 | TRAIN ROC=0.9989 PR=0.5715 | VAL ROC=0.8570 PR=0.1995
  ‚úÖ new best VAL PR


HI-Large train ep8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.47it/s]


Epoch 08 | loss=0.149064 | TRAIN ROC=0.9990 PR=0.5880 | VAL ROC=0.8395 PR=0.1867
  ‚ö†Ô∏è no improvement (1/3)


HI-Large train ep9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:06<00:00, 12.47it/s]


Epoch 09 | loss=0.115515 | TRAIN ROC=0.9993 PR=0.6150 | VAL ROC=0.8609 PR=0.1661
  ‚ö†Ô∏è no improvement (2/3)
Best VAL PR: 0.19948521553767654


In [30]:
model.eval()
H = model.emb.weight.detach().cpu().numpy().astype(np.float32)
print("H:", H.shape)


H: (1932541, 64)


In [31]:
import numpy as np

XTR_MAX = 2_000_000
XVA_MAX = 1_000_000

def build_X(idx_np, max_n):
    if len(idx_np) > max_n:
        idx_np = np.random.choice(idx_np, max_n, replace=False)
    s = src[idx_np]; t = dst[idx_np]
    Hs = H[s]; Hd = H[t]
    X = np.concatenate([Hs, Hd, np.abs(Hs-Hd), Hs*Hd], axis=1)
    return X, y[idx_np]

Xtr, ytr = build_X(tr_idx_np, XTR_MAX)
Xva, yva = build_X(val_idx_np, XVA_MAX)
print("Xtr", Xtr.shape, "pos", ytr.mean())
print("Xva", Xva.shape, "pos", yva.mean())


Xtr (2000000, 256) pos 0.000846
Xva (1000000, 256) pos 0.000818


In [32]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score

dtrain = xgb.DMatrix(Xtr, label=ytr)
dval   = xgb.DMatrix(Xva, label=yva)

params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "max_depth": 8,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "scale_pos_weight": (1 - ytr.mean()) / ytr.mean(),
}

bst = xgb.train(
    params, dtrain,
    num_boost_round=800,
    evals=[(dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=25
)

p = bst.predict(dval)
print("Hybrid HI-Large VAL ROC:", roc_auc_score(yva, p))
print("Hybrid HI-Large VAL PR :", average_precision_score(yva, p))


[0]	val-aucpr:0.00562
[25]	val-aucpr:0.05346
[50]	val-aucpr:0.08565
[75]	val-aucpr:0.10495
[100]	val-aucpr:0.11759
[125]	val-aucpr:0.12124
[150]	val-aucpr:0.12233
[175]	val-aucpr:0.12490
[200]	val-aucpr:0.12687
[225]	val-aucpr:0.12871
[250]	val-aucpr:0.12822
[275]	val-aucpr:0.12945
[300]	val-aucpr:0.12662
[309]	val-aucpr:0.12363
Hybrid HI-Large VAL ROC: 0.868780762908558
Hybrid HI-Large VAL PR : 0.1246230930943737


In [33]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score

H = model.emb.weight.detach().cpu().numpy().astype(np.float32)

def make_X_from_idx(idx_np):
    s = src[idx_np]; t = dst[idx_np]
    Hs = H[s]; Hd = H[t]
    X = np.concatenate([Hs, Hd, np.abs(Hs - Hd), Hs * Hd], axis=1)
    yb = y[idx_np].astype(np.float32)
    return X, yb


In [34]:
params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "max_depth": 8,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "scale_pos_weight": (1 - y[tr_idx_np].mean()) / y[tr_idx_np].mean(),
}

CHUNK_EDGES = 500_000   # increase if RAM allows (1M ok on High-RAM)
NUM_ROUNDS_PER_CHUNK = 50
bst = None

# fixed validation set (use ALL val edges, no sampling)
Xva, yva = make_X_from_idx(val_idx_np)
dval = xgb.DMatrix(Xva, label=yva)

order = tr_idx_np.copy()
np.random.shuffle(order)

for start in range(0, len(order), CHUNK_EDGES):
    idx_chunk = order[start:start+CHUNK_EDGES]
    Xc, yc = make_X_from_idx(idx_chunk)
    dtrain = xgb.DMatrix(Xc, label=yc)

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=NUM_ROUNDS_PER_CHUNK,
        evals=[(dval, "val")],
        xgb_model=bst,          # <-- continues training
        verbose_eval=False
    )

    p = bst.predict(dval)
    roc = roc_auc_score(yva, p)
    pr  = average_precision_score(yva, p)
    print(f"chunk {start//CHUNK_EDGES:03d} | seen={start+len(idx_chunk):,}/{len(order):,} | VAL ROC={roc:.4f} | VAL PR={pr:.4f}")


chunk 000 | seen=500,000/10,800,000 | VAL ROC=0.8368 | VAL PR=0.0640
chunk 001 | seen=1,000,000/10,800,000 | VAL ROC=0.8535 | VAL PR=0.0988
chunk 002 | seen=1,500,000/10,800,000 | VAL ROC=0.7663 | VAL PR=0.0044
chunk 003 | seen=2,000,000/10,800,000 | VAL ROC=0.7793 | VAL PR=0.0078
chunk 004 | seen=2,500,000/10,800,000 | VAL ROC=0.7562 | VAL PR=0.0130
chunk 005 | seen=3,000,000/10,800,000 | VAL ROC=0.6953 | VAL PR=0.0126
chunk 006 | seen=3,500,000/10,800,000 | VAL ROC=0.6820 | VAL PR=0.0125
chunk 007 | seen=4,000,000/10,800,000 | VAL ROC=0.6906 | VAL PR=0.0148
chunk 008 | seen=4,500,000/10,800,000 | VAL ROC=0.6839 | VAL PR=0.0223
chunk 009 | seen=5,000,000/10,800,000 | VAL ROC=0.6920 | VAL PR=0.0177
chunk 010 | seen=5,500,000/10,800,000 | VAL ROC=0.6864 | VAL PR=0.0216
chunk 011 | seen=6,000,000/10,800,000 | VAL ROC=0.6854 | VAL PR=0.0252
chunk 012 | seen=6,500,000/10,800,000 | VAL ROC=0.6894 | VAL PR=0.0215
chunk 013 | seen=7,000,000/10,800,000 | VAL ROC=0.6898 | VAL PR=0.0211
chunk 01