In [1]:
# This is a sample Python script.
import pandas as pd
import torch
device = torch.device('cpu')
import dgl
import networkx as nx
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
import dgl.function as fn


In [11]:
df=pd.read_csv("../data/PaySim_kaggle.csv")

df=df.sample(n=5000000)

print(df.head())

         step      type     amount     nameOrig  oldbalanceOrg  \
3136595   236  CASH_OUT   81990.42   C770683588           0.00   
3603759   263   PAYMENT    3567.16  C1382661261       49225.32   
2230134   186  CASH_OUT   97934.16  C1836567111           0.00   
290198     15   CASH_IN  106967.79  C1478509364     1933159.03   
6161736   550   PAYMENT     975.30   C364317589           0.00   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
3136595            0.00  C1601899904      1862480.76      1944471.18        0   
3603759        45658.16   M869224147            0.00            0.00        0   
2230134            0.00   C314667225       288375.89       386310.05        0   
290198       2040126.82  C1786291008      1344593.37      1148609.99        0   
6161736            0.00   M816906112            0.00            0.00        0   

         isFlaggedFraud  
3136595               0  
3603759               0  
2230134               0  
290198      

Adjust data proportions as the fraud data is heavily biased having less fraudulant cases.

In [12]:
# Set the proportion of non-fraud to remove (e.g., 50% of non-fraud instances)
remove_fraction = 0.9985

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly sample and remove 'remove_fraction' proportion of non-fraud instances
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance
label_counts = df['isFraud'].value_counts()
fraud_ratio = label_counts[1] / len(df)
non_fraud_ratio = label_counts[0] / len(df)

print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")


Fraud count: 6473
Non-fraud count: 7490
Fraud ratio: 0.4636
Non-fraud ratio: 0.5364


In [13]:
# Create a mapping from unique user names to numeric IDs (nodes)
user_mapping = {user: idx for idx, user in enumerate(set(df['nameOrig']).union(set(df['nameDest'])))}

# Create edges between nameOrig and nameDest
src = df['nameOrig'].map(user_mapping).values
dst = df['nameDest'].map(user_mapping).values

In [14]:
print(src)

[ 4399  1489  6576 ... 19390 10962 23283]


In [15]:
# Create a DGL graph from the source and destination nodes
g = dgl.graph((src, dst))


In [16]:
# Add transaction amount as edge feature
g.edata['amount'] = torch.tensor(df['amount'].values, dtype=torch.float32)

# Optional: Add fraud information to edge features
g.edata['isFraud'] = torch.tensor(df['isFraud'].values, dtype=torch.float32)

# Initialize node features with zeros (this handles all nodes)
num_nodes = g.num_nodes()
balance_orig = torch.zeros(num_nodes, dtype=torch.float32)
balance_dest = torch.zeros(num_nodes, dtype=torch.float32)

# Average balance for each origin user (nameOrig)
for orig_user, balance in df.groupby('nameOrig')['oldbalanceOrg'].mean().items():
    balance_orig[user_mapping[orig_user]] = balance

# Average balance for each destination user (nameDest)
for dest_user, balance in df.groupby('nameDest')['oldbalanceDest'].mean().items():
    balance_dest[user_mapping[dest_user]] = balance

node_features = torch.stack([balance_orig, balance_dest], dim=1)  # Changed to stack both features

g.ndata['features'] = node_features


In [17]:
g.edata['isFraud'].shape
#node_features.shape
g.num_edges

<bound method DGLGraph.num_edges of Graph(num_nodes=27808, num_edges=13963,
      ndata_schemes={'features': Scheme(shape=(2,), dtype=torch.float32)}
      edata_schemes={'amount': Scheme(shape=(), dtype=torch.float32), 'isFraud': Scheme(shape=(), dtype=torch.float32)})>

In [18]:
# Print graph information
#print(g)

# Visualize the graph using NetworkX (convert DGL graph to NetworkX for visualization)
#nx_graph = g.to_networkx()

# Optional: Visualize using a layout for better readability
#pos = nx.spring_layout(nx_graph)  # Use a layout for better visualization
#plt.figure(figsize=(12, 12))
#nx.draw(nx_graph, pos, node_size=50, node_color='skyblue', font_size=10, with_labels=True)
#plt.show()

In [19]:
# Assuming that you have edge features like transaction amounts or fraud status
edge_features = g.edata.get('amount', None)  # Assuming 'amount' is an edge feature
if edge_features is not None:
    print("Edge Features (Transaction Amounts):")
    print(edge_features)
else:
    print("No edge features found.")


Edge Features (Transaction Amounts):
tensor([4.4701e+05, 7.1600e+01, 1.0000e+07,  ..., 4.6238e+06, 9.8904e+04,
        1.4662e+04])


In [20]:
edges = g.edges()
# Split the dataset into train and test set
# Generate indices for train-test split (80% train, 20% test)
num_train_edges = int(0.8 * len(edges[0]))
train_indices = torch.arange(num_train_edges)
test_indices = torch.arange(num_train_edges, len(edges[0]))

# Create masks for training and testing
train_mask = torch.zeros(len(edges[0]), dtype=torch.bool)
test_mask = torch.zeros(len(edges[0]), dtype=torch.bool)
train_mask[train_indices] = 1
test_mask[test_indices] = 1


In [21]:
# Define the GraphSAGE model for fraud detection
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GraphSAGE, self).__init__()
        self.layer1 = dgl.nn.SAGEConv(in_feats, hidden_feats, 'mean')
        self.layer2 = dgl.nn.SAGEConv(hidden_feats, out_feats, 'mean')
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(out_feats * 2, 1)  # * Concatenate source and destination node embeddings *
        
    def forward(self, g, features):
        # Apply first GraphSAGE layer and ReLU
        x = self.layer1(g, features)
        x = torch.relu(x)
        x = self.dropout(x)

        # Apply second GraphSAGE layer to get embeddings
        x = self.layer2(g, x)
        
        # Get source and destination nodes for each edge
        src, dst = g.edges()  # Get indices of source and destination nodes
        src_embeddings = x[src]  # Embeddings for source nodes
        dst_embeddings = x[dst]  # Embeddings for destination nodes
        
        # Concatenate source and destination node embeddings to create edge features
        edge_features = torch.cat([src_embeddings, dst_embeddings], dim=1)  # Concatenate along the feature dimension
        logits = self.fc(edge_features).squeeze()  # * Output a single value per edge (fraud score) *
        return logits


In [22]:
# Count occurrences of each label in the 'isFraud' column
label_counts = df['isFraud'].value_counts()

# Calculate the proportion of each class
fraud_ratio = label_counts[1] / len(df)  # Assuming '1' represents fraud
non_fraud_ratio = label_counts[0] / len(df)  # Assuming '0' represents non-fraud

# Print the results
print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")

Fraud count: 6473
Non-fraud count: 7490
Fraud ratio: 0.4636
Non-fraud ratio: 0.5364


In [23]:
# Instantiate the model
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
in_feats = 2  # balanceOrig and balanceDest features
hidden_feats = 64
out_feats = 1  # Fraud (binary classification)

pos_weight = torch.tensor([non_fraud_ratio/fraud_ratio],dtype=torch.float)
#pos_weight = torch.tensor([1.0],dtype=torch.float)

model = GraphSAGE(in_feats, hidden_feats, out_feats)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Train the model
epochs = 200
for epoch in range(epochs):
    model.train()

    # Forward pass
    logits = model(g, g.ndata['features'])
    #print(logits.shape)

    # Get target labels for the fraud detection task
    labels = g.edata['isFraud']
    breakpoint()
    # Compute loss (use train_mask to filter out test edges)
    #print("labels",labels[train_mask].view(-1,1).shape)
    #print("logits", logits[train_mask].view(-1,1).shape)
    loss = loss_fn(logits[train_mask].view(-1,1), labels[train_mask].view(-1,1))
    
    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss every few epochs
    if (epoch + 1) % 2 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")



Epoch 2/200, Loss: 181141.859375
Epoch 4/200, Loss: 157430.96875
Epoch 6/200, Loss: 141447.1875
Epoch 8/200, Loss: 124156.8125
Epoch 10/200, Loss: 105468.703125
Epoch 12/200, Loss: 104094.1171875
Epoch 14/200, Loss: 91447.046875
Epoch 16/200, Loss: 83232.3984375
Epoch 18/200, Loss: 77547.59375
Epoch 20/200, Loss: 70832.34375
Epoch 22/200, Loss: 64617.90625
Epoch 24/200, Loss: 57601.42578125
Epoch 26/200, Loss: 52511.15625
Epoch 28/200, Loss: 48590.59375
Epoch 30/200, Loss: 41939.39453125
Epoch 32/200, Loss: 41129.41796875
Epoch 34/200, Loss: 34319.625
Epoch 36/200, Loss: 33910.08984375
Epoch 38/200, Loss: 30555.84375
Epoch 40/200, Loss: 27881.60546875
Epoch 42/200, Loss: 24897.859375
Epoch 44/200, Loss: 23411.298828125
Epoch 46/200, Loss: 21357.138671875
Epoch 48/200, Loss: 19342.193359375
Epoch 50/200, Loss: 17413.33203125
Epoch 52/200, Loss: 16113.263671875
Epoch 54/200, Loss: 15382.1494140625
Epoch 56/200, Loss: 13575.458984375
Epoch 58/200, Loss: 12372.8271484375
Epoch 60/200, Loss

Epoch 2/20, Loss: 50026236.0
Epoch 4/20, Loss: 21910472.0
Epoch 6/20, Loss: 12671362.0
Epoch 8/20, Loss: 12492275.0
Epoch 10/20, Loss: 13373789.0
Epoch 12/20, Loss: 14267830.0
Epoch 14/20, Loss: 14137046.0
Epoch 16/20, Loss: 12949893.0
Epoch 18/20, Loss: 11810090.0
Epoch 20/20, Loss: 10091464.0
0.1
Accuracy: 0.7332
Recall: 0.0086
F1 Score: 0.0073
0.01
Accuracy: 0.7535
Recall: 0.0039
F1 Score: 0.0036
0.2
Accuracy: 0.7378
Recall: 0.0102
F1 Score: 0.0087
0.4
Accuracy: 0.7405
Recall: 0.0079
F1 Score: 0.0068

In [24]:


# Evaluate the model
model.eval()
with torch.no_grad():
    logits = model(g, g.ndata['features'])

    # Convert logits to probabilities
    predictions = torch.sigmoid(logits).squeeze()
    
    # Apply threshold of 0.5 to classify fraud
    predicted_labels = (predictions > 0.5).float()
    predicted_labels = predicted_labels[test_mask]  # Apply test_mask here
    # Get actual labels
    #true_labels = g.edata['isFraud']
    true_labels = g.edata['isFraud'][test_mask]  # Apply test_mask here

    reverse_user_mapping = {v: k for k, v in user_mapping.items()}  # Reverse mapping
    
    # Retrieve node indices from test_mask
    #test_edge_indices = test_mask.nonzero().squeeze().tolist()
    src, dst = g.edges()
    src_test = src[test_mask]
    dst_test = dst[test_mask]
        
    balance_orig_test = g.ndata['features'][src_test, 0].tolist()
    balance_dest_test = g.ndata['features'][dst_test, 1].tolist()
    
    # Map predictions back to transactions with their original balances
    mapped_results = [
        {
            "orig_user": reverse_user_mapping[src_node.item()],
            "dest_user": reverse_user_mapping[dst_node.item()],
            "predicted_label": int(pred),
            "true_label": int(true_lab),
            "balance_orig": balance_orig,
            "balance_dest": balance_dest
        }
        for src_node, dst_node, pred, true_lab, balance_orig, balance_dest in zip(
            src_test, dst_test, predicted_labels.tolist(), true_labels.tolist(), balance_orig_test, balance_dest_test
        )
    ]
    # Compute accuracy manually
    correct = (predicted_labels == true_labels).sum().item()
    total = true_labels.size(0)
    accuracy = correct / total

    # Print evaluation results
    print(f"Accuracy: {accuracy:.4f}")
    # Calculate F1-Score
    f1 = f1_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    print(f"Recall: {recall:.4f}")
    print(f"Precision:{precision:.4f}")
    print(f"F1 Score: {f1:.4f}")
    for result in mapped_results[1:10]:
        print(result)

Accuracy: 0.7075
Recall: 0.6301
Precision:0.7071
F1 Score: 0.6664
{'orig_user': 'C1715567191', 'dest_user': 'C688227079', 'predicted_label': 0, 'true_label': 0, 'balance_orig': 26673.0, 'balance_dest': 9131827.0}
{'orig_user': 'C384906661', 'dest_user': 'C244625639', 'predicted_label': 0, 'true_label': 0, 'balance_orig': 0.0, 'balance_dest': 3202547.75}
{'orig_user': 'C1050766872', 'dest_user': 'C1332247082', 'predicted_label': 0, 'true_label': 0, 'balance_orig': 10068.0, 'balance_dest': 115406.921875}
{'orig_user': 'C71055143', 'dest_user': 'C1732817669', 'predicted_label': 0, 'true_label': 1, 'balance_orig': 271908.40625, 'balance_dest': 1488088.25}
{'orig_user': 'C744241593', 'dest_user': 'M967615944', 'predicted_label': 0, 'true_label': 0, 'balance_orig': 0.0, 'balance_dest': 0.0}
{'orig_user': 'C294569503', 'dest_user': 'C229133194', 'predicted_label': 1, 'true_label': 1, 'balance_orig': 3309631.0, 'balance_dest': 0.0}
{'orig_user': 'C1223797124', 'dest_user': 'C1631519993', 'pred

In [16]:
pos_weight

tensor([0.7793])

In [17]:
sum(labels==1)

tensor(6408)