In [1]:
import torch
from torch_geometric.data import Data   
import pandas as pd
import random
from itertools import combinations, islice

def create_edges(feature_column, df, max_edges_per_group=100):
    """Link rows of ``df`` that share the same value in ``feature_column``.

    Rows are grouped by ``feature_column``. Inside each group, pairs of row
    positions are enumerated with ``itertools.combinations`` and only the
    first ``max_edges_per_group`` pairs are kept, so large groups are
    truncated rather than sampled. Each pair is emitted once; no reverse
    edge is added.

    Args:
        feature_column: Column (or any key accepted by ``DataFrame.groupby``)
            whose equal values define a group.
        df: DataFrame whose row positions are used as node ids.
        max_edges_per_group: Upper bound on the pairs kept per group.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)``. The
        shape is ``(2, 0)`` when no group yields a pair.
    """
    edge_list = []
    # `.indices` maps each group key to the integer row positions in the group.
    for indices in df.groupby(feature_column).indices.values():
        if len(indices) < 2:
            continue  # a single row cannot form a pair
        # islice stops the lazy pair generator early, so a huge group never
        # materialises all of its n*(n-1)/2 combinations.
        edge_list.extend(islice(combinations(indices, 2), max_edges_per_group))

    # torch.long is the index dtype PyTorch Geometric documents for edge_index.
    if not edge_list:
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edge_list, dtype=torch.long).t().contiguous()


# Load the feature table and the label table; rows of both are used as graph nodes.
X = pd.read_csv("reduced_features.csv")
y = pd.read_csv("balanced_labels.csv").values
if len(X) != len(y):
    raise ValueError(
        f"Feature/label row mismatch: {len(X)} feature rows vs {len(y)} label rows"
    )

# Relational columns: rows sharing a value in one of these get connected.
# They define connectivity only and are dropped from the node features below.
edge_features = ['card1', 'addr1', 'addr2', 'P_emaildomain', 'DeviceType', 'id_17', 'id_28']

# Build one edge block per relational column and concatenate once instead of
# re-allocating the growing tensor on every loop iteration.
edge_blocks = [
    create_edges(feature, X, max_edges_per_group=100) for feature in edge_features
]
# PyTorch Geometric documents edge_index as torch.long, so cast explicitly.
edge_index = torch.cat(edge_blocks, dim=1).long()

x_node = X.drop(columns=edge_features)

data = Data(
    x=torch.tensor(x_node.values, dtype=torch.float32),
    edge_index=edge_index,
    y=torch.tensor(y, dtype=torch.float32),
)


In [2]:
from torch_geometric.nn import GCNConv
#from torch_geometric.transforms import RandomNodeSplit

#transform = RandomNodeSplit(split="train_rest", num_val=0.15, num_test=0.15)
#data = transform(data)

class FraudGNN(torch.nn.Module):
    """Two stacked GCN layers followed by a linear head with a sigmoid output."""

    def __init__(self, input_dim):
        """Create the layers; ``input_dim`` is the width of each node feature row."""
        super().__init__()
        hidden_dim, embed_dim = 64, 32
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, embed_dim)
        self.classifier = torch.nn.Linear(embed_dim, 1)

    def forward(self, data):
        """Return one sigmoid-activated score per row of ``data.x``.

        ``data`` must expose ``x`` and ``edge_index``. The result is already a
        probability, so it pairs with ``BCELoss`` rather than a logits loss.
        """
        hidden = torch.relu(self.conv1(data.x, data.edge_index))
        hidden = torch.relu(self.conv2(hidden, data.edge_index))
        scores = self.classifier(hidden)
        return torch.sigmoid(scores)
        

In [10]:
from sklearn.model_selection import KFold
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def cross_validate(model_class, data, num_folds=5, num_epochs=200, lr=0.001,
                    device=None, verbose=True):
    """Run K-fold cross-validation of a node classifier on a single graph.

    For every fold a fresh ``model_class(input_dim=data.num_node_features)``
    is trained full-batch with Adam and ``BCELoss``. The whole graph is fed to
    the model in both training and evaluation; only the loss and the metrics
    are restricted to the fold's train / validation nodes.

    Side effects:
        * ``data.to(device)`` is called once up front.
          NOTE(review): PyG's ``Data.to`` appears to move the caller's object
          in place — confirm if the caller needs it to stay on CPU.
        * After all folds, the state dict of the fold with the highest
          validation accuracy is written to ``best_fraudgnn_model.pt`` and the
          model object itself is pickled to ``fraudgnn_model.pkl``.

    Args:
        model_class: Callable accepting ``input_dim=...`` and returning a
            ``torch.nn.Module`` whose forward takes ``data`` and returns
            probabilities in [0, 1] (required by ``BCELoss``).
        data: Graph object exposing ``x``, ``edge_index``, ``y``,
            ``num_nodes``, ``num_node_features`` and ``to(device)``.
        num_folds: Number of KFold splits.
        num_epochs: Optimisation steps per fold.
        lr: Adam learning rate.
        device: Torch device; defaults to CUDA when available, else CPU.
        verbose: Print a progress line per fold.

    Returns:
        ``(metrics, results)``. ``results`` is a list with one dict per fold
        (accuracy, precision, recall, f1, roc_auc). ``metrics`` maps
        ``mean_<name>`` / ``std_<name>`` to the mean and population standard
        deviation of each entry across folds.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The device transfer is loop-invariant: do it once, not once per epoch.
    data = data.to(device)
    num_nodes = data.num_nodes

    best_model_state = None
    best_model = None
    best_metric = float('-inf')
    results = []

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kf.split(torch.arange(num_nodes))):
        if verbose:
            print(f"Fold {fold + 1}/{num_folds}")

        # Build the boolean masks on CPU from the index arrays, then move them
        # next to the tensors they index.
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        train_mask = train_mask.to(device)
        val_mask = val_mask.to(device)

        model = model_class(input_dim=data.num_node_features).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        loss_fn = torch.nn.BCELoss()
        train_targets = data.y[train_mask]

        model.train()
        for _ in range(num_epochs):
            optimizer.zero_grad()
            out = model(data)
            loss = loss_fn(out[train_mask], train_targets)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            # The model already outputs probabilities (BCELoss contract).
            probs = model(data)[val_mask].reshape(-1).cpu().numpy()
            labels = data.y[val_mask].cpu().numpy().flatten()

        preds = (probs > 0.5).astype(int)
        accuracy = (preds == labels).sum() / len(preds)
        results.append({
            'accuracy': accuracy,
            'precision': precision_score(labels, preds, zero_division=0),
            'recall': recall_score(labels, preds, zero_division=0),
            'f1': f1_score(labels, preds, zero_division=0),
            'roc_auc': roc_auc_score(labels, probs)
        })

        if accuracy > best_metric:
            best_metric = accuracy
            best_model_state = model.state_dict()
            best_model = model

    # Persist once, after every fold has competed, instead of rewriting the
    # checkpoint files at the end of each fold.
    if best_model_state is not None:
        torch.save(best_model_state, "best_fraudgnn_model.pt")
        with open("fraudgnn_model.pkl", "wb") as f:
            # NOTE: a pickled model executes code when loaded; only load this
            # file from a trusted location.
            pickle.dump(best_model, f)

    # Aggregate over the folds actually recorded.
    metrics = {}
    for key in results[0]:
        values = [fold_result[key] for fold_result in results]
        mean = sum(values) / len(values)
        metrics[f"mean_{key}"] = mean
        metrics[f"std_{key}"] = (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5

    return metrics, results
        


In [11]:
# Cross-validate FraudGNN on the full graph with the default settings and show
# the per-fold scores; the aggregated values are kept in `metrics`.
metrics, results = cross_validate(model_class=FraudGNN, data=data)
print(results)


Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
[{'accuracy': 0.9826673276274287, 'precision': 0.9984602286107639, 'recall': 0.9668382785023286, 'f1': 0.9823948525774987, 'roc_auc': 0.9867504290959459}, {'accuracy': 0.9826366192734404, 'precision': 0.9990281650484555, 'recall': 0.9661733058105318, 'f1': 0.9823260964696846, 'roc_auc': 0.9867555204855827}, {'accuracy': 0.9826585538120035, 'precision': 0.9990741665229507, 'recall': 0.9661955758426967, 'f1': 0.9823598448844443, 'roc_auc': 0.9867318429647408}, {'accuracy': 0.9821189641633509, 'precision': 0.9971271636613306, 'recall': 0.96707321347212, 'f1': 0.9818702629612497, 'roc_auc': 0.984904746441815}, {'accuracy': 0.9831498135556043, 'precision': 0.9989940549554122, 'recall': 0.9672700788852522, 'f1': 0.9828761474229516, 'roc_auc': 0.9875578074397584}]


In [8]:
# This cell and the evaluation cells after it read data.train_mask / data.test_mask.
# The RandomNodeSplit call that used to create them is commented out earlier in
# the notebook, so a fresh kernel has no masks. Recreate them with the same
# settings, but only when they are missing, so an existing split is never replaced.
if "train_mask" not in data or "test_mask" not in data:
    from torch_geometric.transforms import RandomNodeSplit

    graph_device = data.x.device
    data = RandomNodeSplit(split="train_rest", num_val=0.15, num_test=0.15)(data)
    data = data.to(graph_device)  # keep the new masks next to the other tensors

# Keep the model on the same device as the graph: cross_validate may already
# have moved `data` to the GPU.
model = FraudGNN(input_dim=data.num_node_features).to(data.x.device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.BCELoss()

# Full-batch training: every step runs the whole graph through the model, but
# the loss only looks at the training nodes.
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    # Uncomment to trace the loss per epoch:
    #print(f"Epoch: {epoch}, Loss: {loss:.4f}")



In [12]:
# Score the trained model on the held-out test nodes (threshold 0.5).
model.eval()
with torch.no_grad():
    test_probs = model(data)[data.test_mask]
    test_preds = (test_probs > 0.5).float()
    test_labels = data.y[data.test_mask]
    accuracy = test_preds.eq(test_labels).sum() / test_preds.size(0)

print(f"Test Accuracy: {accuracy.item():.4f}") 

Test Accuracy: 0.9832


In [20]:
# Confusion-matrix counts on the test split; .item() turns each 0-dim tensor
# into a plain Python int.
test_labels = data.y[data.test_mask]
TP = ((test_preds == 1) & (test_labels == 1)).sum().item()
FP = ((test_preds == 1) & (test_labels == 0)).sum().item()
FN = ((test_preds == 0) & (test_labels == 1)).sum().item()

# Guard each ratio against an empty denominator (no predicted / no actual positives).
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
# Stored as `f1`, not `f1_score`: the notebook imports sklearn.metrics.f1_score
# under that name and cross_validate calls it, so rebinding it to a float
# would make any later cross_validate run fail.
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Precision: 0.9995
Recall: 0.9668
F1-Score: 0.9829
