In [1]:
import os, json, numpy as np, pandas as pd
import networkx as nx
from networkx.readwrite import json_graph

In [None]:
# Mount Google Drive inside the Colab runtime (forcing a remount if it is already mounted).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Root directory onto which the 'processed' folder name is joined in the next cell.
# NOTE(review): an empty string makes every path relative to the current working
# directory rather than the mounted drive — confirm this is intended.
DATA_DIR = ''

Mounted at /content/gdrive


In [3]:
# All inputs are read from <DATA_DIR>/processed; outputs share the 'graphsage_edges' prefix there.
PROC_DIR = os.path.join(DATA_DIR, 'processed')
OUT_PREFIX = os.path.join(PROC_DIR, 'graphsage_edges')

# The three split files follow the '<split>_clean.parquet' naming pattern.
train_pr, valid_pr, test_pr = (
    pd.read_parquet(os.path.join(PROC_DIR, f'{split}_clean.parquet'))
    for split in ('train', 'valid', 'test')
)
# Feature table that is later joined onto each split by 'tx_id'.
principal_feats = pd.read_parquet(os.path.join(PROC_DIR, 'principal_feats.parquet'))

In [None]:
# Print the column index of every loaded frame, one labelled block per frame.
for frame_label, frame in (
    ('train_pr', train_pr),
    ('valid_pr', valid_pr),
    ('test_pr', test_pr),
    ('principal_feats', principal_feats),
):
    print(f'{frame_label}.columns:\n{frame.columns}\n\n')

In [5]:
def merge_feats(split_df, feats_df):
    """Left-join the engineered per-transaction features onto one data split.

    Parameters
    ----------
    split_df : pandas.DataFrame
        One split (train/valid/test). Must contain a ``tx_id`` column.
    feats_df : pandas.DataFrame
        Feature table. Must contain a ``tx_id`` column; any subset of the
        candidate feature columns listed below may be present.

    Returns
    -------
    pandas.DataFrame
        ``split_df`` (row order preserved) with the candidate feature columns
        appended. Rows whose ``tx_id`` is missing from ``feats_df`` get NaN in
        the appended columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``tx_id`` is duplicated on either side (``validate='one_to_one'``).
    """
    candidate_feats = [
        'cp_bucket',
        '_dow','_hour','_amt_log','_same_bank',
        '_src_tx_count_prev','_src_amt_sum_prev','_src_amt_mean_prev','_src_amt_std_prev','_src_secs_since_last','_src_amt_z',
        '_dst_rx_count_prev','_dst_amt_sum_prev','_dst_amt_mean_prev','_dst_amt_std_prev','_dst_secs_since_last','_dst_amt_z',
        '_src_roll_mean_K','_src_roll_std_K','_src_roll_sum_K','_src_ewm_mean',
        '_dst_roll_mean_K','_dst_roll_std_K','_dst_roll_sum_K','_dst_ewm_mean'
    ]

    # A feature that the split already carries must not be merged again:
    # pandas would rename both copies with '_x' / '_y' suffixes, and the
    # un-suffixed column name would disappear from the result.
    already_in_split = set(split_df.columns)
    feat_cols = ['tx_id'] + [
        c for c in candidate_feats
        if c in feats_df.columns and c not in already_in_split
    ]

    merged = split_df.merge(feats_df[feat_cols], on='tx_id', how='left', validate='one_to_one')
    return merged

In [6]:
# Attach the engineered features to every split, in train / valid / test order.
train_m, valid_m, test_m = (
    merge_feats(split_frame, principal_feats)
    for split_frame in (train_pr, valid_pr, test_pr)
)

In [None]:
# Columns every merged split must still expose under its original name.
required_cols = ('src_account', 'dst_account', 'label', 'fx_spread', 'tx_id')

# The loop variable names are kept as-is: they stay bound in the notebook
# namespace after the loop finishes.
for name, dfm in (('train', train_m), ('valid', valid_m), ('test', test_m)):
    for c in required_cols:
        assert c in dfm.columns, f"[{name}] faltando coluna {c}"
print("OK: merge sem sufixos indesejados.")

In [None]:
# Gather every account seen as source or destination, split by split
# (train, valid, test), and keep the first-appearance order of the unique values.
account_columns = [
    split_frame[col]
    for split_frame in (train_m, valid_m, test_m)
    for col in ('src_account', 'dst_account')
]
accounts_all = pd.Index(pd.unique(pd.concat(account_columns, ignore_index=True)))

# Account -> consecutive integer node id (position in accounts_all).
id_map = dict(zip(accounts_all, range(len(accounts_all))))

id_map_path = f"{OUT_PREFIX}-id_map.json"
with open(id_map_path, "w") as f:
    json.dump(id_map, f)
print("OK:", id_map_path, "| num_nodes:", len(accounts_all))

In [9]:
# Lookup Series: index = account identifier, value = its position in accounts_all.
# Used with .loc to translate account columns into integer node ids.
acc2idx = pd.Series(range(len(accounts_all)), index=accounts_all)

In [None]:
# Node features, computed from the training split only:
#   deg_out / deg_in : number of training transactions sent / received per account
#   sum_out / sum_in : total 'amount_paid' sent / received per account
n = len(accounts_all)

# Resolve account -> node id once per endpoint instead of once per aggregate.
src_idx = acc2idx.loc[train_m['src_account']].to_numpy()
dst_idx = acc2idx.loc[train_m['dst_account']].to_numpy()

# Accumulate in float64: adding many amounts into a float32 accumulator drops
# small contributions once the running total grows large.
amounts = train_m['amount_paid'].to_numpy(dtype='float64')

# bincount with minlength=n yields one slot per node (zero for accounts that
# never appear in the training split).
deg_out = np.bincount(src_idx, minlength=n).astype(np.int32)
deg_in  = np.bincount(dst_idx, minlength=n).astype(np.int32)
sum_out = np.bincount(src_idx, weights=amounts, minlength=n)
sum_in  = np.bincount(dst_idx, weights=amounts, minlength=n)

# Degrees are stored raw; the amount totals are compressed with log1p.
# The final matrix is cast to float32 only after the sums are complete.
X_nodes = np.stack([deg_out, deg_in, np.log1p(sum_out), np.log1p(sum_in)], axis=1).astype('float32')
np.save(f"{OUT_PREFIX}-feats.npy", X_nodes)
print("OK:", f"{OUT_PREFIX}-feats.npy", "| shape:", X_nodes.shape)

In [11]:
# Undirected graph with one node per known account (keyed by the account value itself).
G = nx.Graph()
G.add_nodes_from(accounts_all)

In [12]:
def add_edges_unique(df):
    """Add one undirected edge to the global graph ``G`` per distinct account pair in ``df``.

    Each (src_account, dst_account) row is first put in canonical order
    (smaller value first) so that A->B and B->A collapse to the same pair.
    Self-pairs (same account on both ends) are skipped.
    """
    src = df['src_account'].values
    dst = df['dst_account'].values
    src_is_first = src <= dst

    ordered_pairs = pd.DataFrame({
        'u': np.where(src_is_first, src, dst),
        'v': np.where(src_is_first, dst, src),
    }).drop_duplicates()

    # Re-adding an edge that is already in an nx.Graph leaves the graph
    # unchanged, so pairs seen in an earlier call need no explicit check.
    G.add_edges_from(
        (u, v)
        for u, v in zip(ordered_pairs['u'], ordered_pairs['v'])
        if u != v
    )

In [13]:
# Insert the account pairs of every split into the graph, train first.
for merged_split in (train_m, valid_m, test_m):
    add_edges_unique(merged_split)

In [None]:
# Serialize the graph in networkx node-link form and write it next to the other artifacts.
graph_path = f"{OUT_PREFIX}-G.json"
G_json = json_graph.node_link_data(G)
with open(graph_path, "w") as f:
    json.dump(G_json, f)
print(
    "OK:", graph_path,
    "| nodes:", G.number_of_nodes(),
    "| edges:", G.number_of_edges(),
)

In [15]:
# Candidate per-transaction (edge) feature columns, listed in export order.
edge_feat_candidates = (
    '_dow', '_hour', '_amt_log', '_same_bank',
    '_src_tx_count_prev', '_src_amt_sum_prev', '_src_amt_mean_prev',
    '_src_amt_std_prev', '_src_secs_since_last', '_src_amt_z',
    '_dst_rx_count_prev', '_dst_amt_sum_prev', '_dst_amt_mean_prev',
    '_dst_amt_std_prev', '_dst_secs_since_last', '_dst_amt_z',
    'fx_spread',
    '_src_roll_mean_K', '_src_roll_std_K', '_src_roll_sum_K', '_src_ewm_mean',
    '_dst_roll_mean_K', '_dst_roll_std_K', '_dst_roll_sum_K', '_dst_ewm_mean',
)

# Keep only the candidates that are actually present in the merged training frame.
EDGE_FEATS = [col for col in edge_feat_candidates if col in train_m.columns]

In [16]:
def export_edge_split(name, dfm):
    """Write the edge-level arrays of one split as ``<OUT_PREFIX>-<name>_*.npy`` files.

    Parameters
    ----------
    name : str
        Split label used in the output file names (e.g. "train").
    dfm : pandas.DataFrame
        Merged split; must contain 'src_account', 'dst_account', 'label',
        'tx_id' and every column listed in the global ``EDGE_FEATS``.

    Files written (all with one row per row of ``dfm``, in the same order):
    ``_edges`` (int64 [src_id, dst_id] pairs), ``_edge_feats`` (float32),
    ``_edge_labels`` (int64), ``_cp_bucket`` (int64; all zeros when the
    column is absent) and ``_txids`` (int64).
    """
    def _node_ids(account_col):
        # Translate an account column into integer node ids via the global lookup.
        return acc2idx.loc[dfm[account_col]].to_numpy().astype('int64')

    sources = _node_ids('src_account')
    targets = _node_ids('dst_account')

    edge_matrix = dfm[EDGE_FEATS].to_numpy().astype('float32')
    labels = dfm['label'].to_numpy().astype('int64')
    tx_ids = dfm['tx_id'].to_numpy().astype('int64')

    if 'cp_bucket' in dfm.columns:
        buckets = dfm['cp_bucket'].to_numpy().astype('int64')
    else:
        buckets = np.zeros(len(dfm), dtype='int64')

    outputs = (
        ('edges', np.stack([sources, targets], axis=1)),
        ('edge_feats', edge_matrix),
        ('edge_labels', labels),
        ('cp_bucket', buckets),
        ('txids', tx_ids),
    )
    for stem, values in outputs:
        np.save(f"{OUT_PREFIX}-{name}_{stem}.npy", values)

    print(f"OK: {name} | edges:", len(dfm), "| feat_dim:", edge_matrix.shape[1])

In [None]:
# Export the edge arrays of each split under its own file-name label.
for split_label, split_frame in (("train", train_m), ("valid", valid_m), ("test", test_m)):
    export_edge_split(split_label, split_frame)

In [18]:
# Metadata describing the exported arrays.
# The cp_bucket entries are derived from all three merged splits explicitly,
# rather than from whichever frame an earlier loop variable happens to hold.
merged_splits = (train_m, valid_m, test_m)
has_cp_bucket = all('cp_bucket' in split_frame.columns for split_frame in merged_splits)

if has_cp_bucket:
    # Largest bucket id across every split; Series.max() ignores missing values.
    cp_max = pd.concat(
        [split_frame['cp_bucket'] for split_frame in merged_splits],
        ignore_index=True,
    ).max()
    # Fall back to a single bucket if no bucket value is available at all.
    cp_n_buckets = int(cp_max) + 1 if pd.notna(cp_max) else 1
else:
    # Without the column the exporter writes all-zero buckets, i.e. one bucket.
    cp_n_buckets = 1

meta = {
    "edge_feat_names": EDGE_FEATS,
    "node_feat_names": ["deg_out","deg_in","log1p_sum_out","log1p_sum_in"],
    "has_cp_bucket": has_cp_bucket,
    "cp_n_buckets": cp_n_buckets
}
with open(f"{OUT_PREFIX}-meta.json", "w") as f:
    json.dump(meta, f, indent=2)

In [19]:
# Number of edges (rows) per split, written alongside the other artifacts.
split_info = {
    split_label: int(len(split_frame))
    for split_label, split_frame in (
        ("train", train_m),
        ("valid", valid_m),
        ("test", test_m),
    )
}
with open(f"{OUT_PREFIX}-edge_split.json", "w") as f:
    json.dump(split_info, f, indent=2)

In [None]:
# Human-readable listing of the files produced under OUT_PREFIX.
print("\nArquivos gerados (prefix):", OUT_PREFIX)
for summary_line in (
    " -G.json, -id_map.json, -feats.npy",
    " -train_edges.npy / -valid_edges.npy / -test_edges.npy        (pares [src_id,dst_id])",
    " -train_edge_feats.npy / ...                                  (float32, mesma ordem)",
    " -train_edge_labels.npy / ...                                 (0/1, mesma ordem)",
    " -train_cp_bucket.npy / ...                                   (int64, p/ embedding)",
    " -train_txids.npy / ...                                       (referência tx_id)",
    " -meta.json, -edge_split.json",
):
    print(summary_line)

In [None]:
import os, numpy as np, json, glob
OUT_PREFIX = os.path.join(PROC_DIR, 'graphsage_edges')

# Report, for each graph-level artifact, whether the file exists on disk.
for suffix in ('-G.json', '-id_map.json', '-feats.npy'):
    artifact_path = OUT_PREFIX + suffix
    print(suffix, os.path.exists(artifact_path), artifact_path)

# Reload the per-split arrays and check that they are row-aligned.
for name in ('train', 'valid', 'test'):
    edges, feats, labs, cpb, txids = (
        np.load(f"{OUT_PREFIX}-{name}_{stem}.npy")
        for stem in ('edges', 'edge_feats', 'edge_labels', 'cp_bucket', 'txids')
    )
    print(f"{name}: edges={edges.shape}, feats={feats.shape}, labels={labs.shape}, cp={cpb.shape}, txids={txids.shape}")
    # Every array must have exactly the same number of rows.
    row_counts = {arr.shape[0] for arr in (edges, feats, labs, cpb, txids)}
    assert len(row_counts) == 1
    assert edges.shape[1] == 2  # [src_id, dst_id]

# Reload the metadata file and show the recorded feature names.
with open(f"{OUT_PREFIX}-meta.json") as f:
    meta = json.load(f)
print("edge_feat_names:", len(meta["edge_feat_names"]))
print("node_feat_names:", meta["node_feat_names"])


In [None]:
# Check that every exported node id is a valid index into the id map.
# The file is opened in a context manager so the handle is closed right after loading.
with open(f"{OUT_PREFIX}-id_map.json") as f:
    id_map = json.load(f)
num_nodes = len(id_map)
for name in ['train','valid','test']:
    edges = np.load(f"{OUT_PREFIX}-{name}_edges.npy")
    mn, mx = edges.min(), edges.max()
    print(name, "node_id range:", mn, "->", mx, " (num_nodes =", num_nodes, ")")
    # Ids must lie in [0, num_nodes).
    assert mn >= 0 and mx < num_nodes