In [6]:
import torch
from torch_geometric.data import Data   
import pandas as pd
import random
from itertools import combinations, islice

def create_edges(feature_column, df, max_edges_per_group=100):
    """Link rows of ``df`` that share a value in ``feature_column``.

    Rows are grouped by ``feature_column``; inside every group of two or more
    rows, node pairs are drawn from ``itertools.combinations`` in its native
    order until ``max_edges_per_group`` pairs have been taken. Because only the
    first pairs are kept, large groups are connected mostly through their
    earliest rows rather than uniformly.

    Nodes are identified by positional row number (``GroupBy.indices``), not by
    index label, so the ids line up with a tensor built from ``df.values``.
    Each pair is emitted once, in a single direction; callers that need an
    undirected graph must add the reverse edges themselves. Rows whose key is
    missing form no group under pandas' default ``groupby`` settings and
    therefore receive no edges.

    Args:
        feature_column: Name of the column whose equal values define a relation.
        df: DataFrame with one row per node.
        max_edges_per_group: Upper bound on the number of pairs taken from any
            single group.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)`` in COO
        layout; shape ``(2, 0)`` when no group has at least two rows.
    """
    edge_list = []
    for positions in df.groupby(feature_column).indices.values():
        if len(positions) < 2:
            # A singleton group has nobody to connect to.
            continue
        edge_list.extend(islice(combinations(positions, 2), max_edges_per_group))

    # torch.long (int64) is the dtype PyTorch Geometric documents for
    # edge_index; int32 indices are rejected by several index/scatter ops.
    if not edge_list:
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edge_list, dtype=torch.long).t().contiguous()


# Load the node feature table and the label table.
# NOTE(review): assumes row i of both CSVs describes the same transaction and
# that every non-relational column is numeric — confirm against the files.
X = pd.read_csv("reduced_features.csv")
y = pd.read_csv("balanced_labels.csv").values

# Columns whose shared values define relations between rows. They are used
# only to build edges and are removed from the node feature matrix below.
edge_features = ['card1', 'addr1', 'addr2', 'P_emaildomain', 'DeviceType', 'id_17', 'id_28']

# Collect one edge block per relational feature and concatenate once at the
# end. Everything is kept as torch.long (int64), the dtype PyTorch Geometric
# documents for edge_index; the empty seed keeps torch.cat valid even if the
# feature list is emptied.
edge_blocks = [torch.empty((2, 0), dtype=torch.long)]
edge_blocks += [
    create_edges(feature, X, max_edges_per_group=100).long()
    for feature in edge_features
]
edge_index = torch.cat(edge_blocks, dim=1)

x_node = X.drop(columns=edge_features)

data = Data(
    x=torch.tensor(x_node.values, dtype=torch.float32),
    edge_index=edge_index,
    y=torch.tensor(y, dtype=torch.float32),
)


In [7]:
from torch_geometric.nn import GCNConv
from torch_geometric.transforms import RandomNodeSplit

# Seed torch's global RNG before anything random happens so that the node
# split below (and the weight initialisation in later cells) is the same on
# every Restart & Run All.
torch.manual_seed(42)

# "train_rest": 15% of the nodes go to validation, 15% to test, and every
# remaining node is used for training. The transform attaches train_mask,
# val_mask and test_mask to the data object.
transform = RandomNodeSplit(split="train_rest", num_val=0.15, num_test=0.15)
data = transform(data)

class FraudGNN(torch.nn.Module):
    """Two graph-convolution layers followed by a linear head with a sigmoid.

    The network maps each node to a single value in (0, 1).
    """

    def __init__(self, input_dim):
        """Create the layers.

        Args:
            input_dim: Number of features per node fed to the first convolution.
        """
        super().__init__()
        hidden_dim, embed_dim = 64, 32
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, embed_dim)
        self.classifier = torch.nn.Linear(embed_dim, 1)

    def forward(self, data):
        """Run the network over a whole graph.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The sigmoid of the classifier output, one row per node.
        """
        edges = data.edge_index
        hidden = torch.relu(self.conv1(data.x, edges))
        hidden = torch.relu(self.conv2(hidden, edges))
        logits = self.classifier(hidden)
        return torch.sigmoid(logits)
        

In [8]:
# Full-batch training: each epoch runs the model over the entire graph and
# back-propagates the binary cross-entropy measured on the training nodes only.
model = FraudGNN(input_dim=data.num_node_features)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.BCELoss()

# The mask and the matching targets do not change between epochs.
train_mask = data.train_mask
train_targets = data.y[train_mask]

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[train_mask], train_targets)
    loss.backward()
    optimizer.step()
    # Per-epoch logging is disabled; uncomment to trace convergence.
    #print(f"Epoch: {epoch}, Loss: {loss:.4f}")



In [12]:
# Score the held-out test nodes with the model in eval mode and gradient
# tracking off; outputs above 0.5 are counted as the positive class.
model.eval()
with torch.no_grad():
    test_probs = model(data)[data.test_mask]
    test_preds = (test_probs > 0.5).float()
    num_correct = (test_preds == data.y[data.test_mask]).sum()
    accuracy = num_correct / len(test_preds)

print(f"Test Accuracy: {accuracy.item():.4f}")

Test Accuracy: 0.9832


In [20]:
# Confusion-matrix counts on the test nodes, with label 1 as the positive
# class. .item() turns each count into a plain Python number.
test_labels = data.y[data.test_mask]
predicted_pos = test_preds == 1
actual_pos = test_labels == 1

TP = (predicted_pos & actual_pos).sum().item()
FP = (predicted_pos & (test_labels == 0)).sum().item()
FN = ((test_preds == 0) & actual_pos).sum().item()

# Each ratio falls back to 0 when its denominator is empty.
precision = 0
if (TP + FP) > 0:
    precision = TP / (TP + FP)

recall = 0
if (TP + FN) > 0:
    recall = TP / (TP + FN)

f1_score = 0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

Precision: 0.9995
Recall: 0.9668
F1-Score: 0.9829
