In [1]:
# This is a sample Python script.
import pandas as pd
import torch
device = torch.device('cpu')
import dgl
import networkx as nx
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
import dgl.function as fn


In [2]:
df=pd.read_csv("../data/PaySim_kaggle.csv")

df=df.sample(n=5000000)

print(df.head())

         step      type     amount     nameOrig  oldbalanceOrg  \
5615431   395  TRANSFER  308362.85   C140079217            0.0   
4012845   299  CASH_OUT  255018.29   C665653517         2010.0   
4178020   304   PAYMENT   55154.11   C451947605            0.0   
6207478   587   PAYMENT   25232.50   C461231178        80849.0   
482370     19   PAYMENT    1819.55  C1654714032       238819.0   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
5615431            0.00  C2051203950       817878.27      1126241.12        0   
4012845            0.00   C463826132      4685364.02      4940382.31        0   
4178020            0.00   M404968312            0.00            0.00        0   
6207478        55616.50   M880297774            0.00            0.00        0   
482370        236999.45   M451967605            0.00            0.00        0   

         isFlaggedFraud  
5615431               0  
4012845               0  
4178020               0  
6207478     

In [3]:
# Set the proportion of non-fraud to remove (e.g., 50% of non-fraud instances)
remove_fraction = 0.99

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly sample and remove 'remove_fraction' proportion of non-fraud instances
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance
label_counts = df['isFraud'].value_counts()
fraud_ratio = label_counts[1] / len(df)
non_fraud_ratio = label_counts[0] / len(df)

print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")


Fraud count: 6544
Non-fraud count: 49935
Fraud ratio: 0.1159
Non-fraud ratio: 0.8841


In [4]:
# Create a mapping from unique user names to numeric IDs (nodes)
user_mapping = {user: idx for idx, user in enumerate(set(df['nameOrig']).union(set(df['nameDest'])))}

# Create edges between nameOrig and nameDest
src = df['nameOrig'].map(user_mapping).values
dst = df['nameDest'].map(user_mapping).values

In [5]:
print(src)

[ 76455  99097 106274 ...  80441   8708  82720]


In [6]:
# Create a DGL graph from the source and destination nodes
g = dgl.graph((src, dst))


In [7]:
# Add transaction amount as edge feature
g.edata['amount'] = torch.tensor(df['amount'].values, dtype=torch.float32)

# Optional: Add fraud information to edge features
g.edata['isFraud'] = torch.tensor(df['isFraud'].values, dtype=torch.float32)

# Initialize node features with zeros (this handles all nodes)
num_nodes = g.num_nodes()
balance_orig = torch.zeros(num_nodes, dtype=torch.float32)
balance_dest = torch.zeros(num_nodes, dtype=torch.float32)

for orig_user, balance in df[['nameOrig', 'oldbalanceOrg']].drop_duplicates().values:
    balance_orig[user_mapping[orig_user]] = balance

for dest_user, balance in df[['nameDest', 'oldbalanceDest']].drop_duplicates().values:
    balance_dest[user_mapping[dest_user]] = balance

node_features = torch.stack([balance_orig, balance_dest], dim=1)  # Changed to stack both features

g.ndata['features'] = node_features

node_labels = torch.zeros(num_nodes, dtype=torch.float32)

# Map 'isFraud' values to corresponding nodes based on user mapping
for user, fraud in df[['nameOrig', 'isFraud']].drop_duplicates().values:
    node_labels[user_mapping[user]] = fraud


# Map 'isFraud' to destination nodes (nameDest)
for dest_user, fraud in df[['nameDest', 'isFraud']].drop_duplicates().values:
    node_labels[user_mapping[dest_user]] = fraud

# Store the fraud labels in g.ndata['isFraud']
g.ndata['isFraud'] = node_labels


In [8]:
g.edata['isFraud'].shape
#node_features.shape
g.num_edges

<bound method DGLGraph.num_edges of Graph(num_nodes=110779, num_edges=56479,
      ndata_schemes={'features': Scheme(shape=(2,), dtype=torch.float32), 'isFraud': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'amount': Scheme(shape=(), dtype=torch.float32), 'isFraud': Scheme(shape=(), dtype=torch.float32)})>

In [9]:
# Print graph information
#print(g)

# Visualize the graph using NetworkX (convert DGL graph to NetworkX for visualization)
#nx_graph = g.to_networkx()

# Optional: Visualize using a layout for better readability
#pos = nx.spring_layout(nx_graph)  # Use a layout for better visualization
#plt.figure(figsize=(12, 12))
#nx.draw(nx_graph, pos, node_size=50, node_color='skyblue', font_size=10, with_labels=True)
#plt.show()

In [10]:
# Assuming that you have edge features like transaction amounts or fraud status
edge_features = g.edata.get('amount', None)  # Assuming 'amount' is an edge feature
if edge_features is not None:
    print("Edge Features (Transaction Amounts):")
    print(edge_features)
else:
    print("No edge features found.")


Edge Features (Transaction Amounts):
tensor([9.6495e+04, 8.8894e+03, 1.9429e+03,  ..., 1.4209e+04, 2.1160e+01,
        1.2166e+05])


In [11]:
# Select 80% of nodes for training
num_train_nodes = int(0.8 * num_nodes)
train_indices = torch.randperm(num_nodes)[:num_train_nodes]  
test_indices = torch.tensor([i for i in range(num_nodes) if i not in train_indices])

# Create train/test masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
test_mask[test_indices] = True

# Assign to graph
g.ndata['train_mask'] = train_mask
g.ndata['test_mask'] = test_mask


In [12]:
# Define the GraphSAGE model for fraud detection
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GraphSAGE, self).__init__()
        self.layer1 = dgl.nn.SAGEConv(in_feats, hidden_feats, 'mean')
        self.layer2 = dgl.nn.SAGEConv(hidden_feats, out_feats, 'mean')
        self.dropout = nn.Dropout(0.5)
        #self.fc = nn.Linear(out_feats * 2, 1)  # * Concatenate source and destination node embeddings *
        self.fc = nn.Linear(out_feats, 1)  # Output a single value per node (fraud score)

    def forward_old2(self, blocks, features):
        # Apply first GraphSAGE layer and ReLU
        x = self.layer1(blocks[0], features)
        x = torch.relu(x)
        x = self.dropout(x)

        # Apply second GraphSAGE layer to get node embeddings
        x = self.layer2(blocks[1], x)

        # Get source and destination nodes for each edge
        src, dst = blocks[1].edges()  # Use the second block for destination nodes
        src_local, dst_local = blocks[1].srcdata['_ID'], blocks[1].dstdata['_ID']

        # Embeddings for source and destination nodes (local indices)
        print(src_local[1:5])
        print(x)
        src_embeddings = x[src_local]  # Embeddings for source nodes
        dst_embeddings = x[dst_local]  # Embeddings for destination nodes
        
        # Concatenate source and destination node embeddings to create edge features
        edge_features = torch.cat([src_embeddings, dst_embeddings], dim=1)  # Concatenate along the feature dimension
        
        # Output a prediction for each edge (fraud score)
        logits = self.fc(edge_features).squeeze()  # Output a single value per edge
        
        return logits

    def forward(self, g, features):
        # Apply first GraphSAGE layer and ReLU
        x = self.layer1(g, features)
        x = torch.relu(x)
        x = self.dropout(x)

        # Apply second GraphSAGE layer to get node embeddings
        x = self.layer2(g, x)
        
        # Output a prediction for each node
        logits = self.fc(x).squeeze()  # Output a single value per node
        return logits
        

    def forward_old(self, blocks, features):
        # Apply first GraphSAGE layer and ReLU
        x = self.layer1(blocs[0], features)
        x = torch.relu(x)
        x = self.dropout(x)

        # Apply second GraphSAGE layer to get node embeddings
        x = self.layer2(blocs[1], x)

        # Get source and destination nodes for each edge
        src, dst = blocks[1].edges()  # Use the second block for destination nodes
        src_local, dst_local = blocks[1].srcdata['_ID'], blocks[1].dstdata['_ID']
        
        # Map to global node IDs
        src_global = blocks[1].srcdata['_ID'][src]
        dst_global = blocks[1].dstdata['_ID'][dst]
        src_embeddings = x[src_local]  # Embeddings for source nodes
        dst_embeddings = x[dst_local]  # Embeddings for destination nodes
        
        # Concatenate source and destination node embeddings to create edge features
        edge_features = torch.cat([src_embeddings, dst_embeddings], dim=1)  # Concatenate along the feature dimension
        
        # Output a prediction for each node
        logits = self.fc(edge_features).squeeze()  # Output a single value per node
        return logits
        
    def forward_edge(self, g, features):
        # Apply first GraphSAGE layer and ReLU
        x = self.layer1(g, features)
        x = torch.relu(x)
        x = self.dropout(x)

        # Apply second GraphSAGE layer to get embeddings
        x = self.layer2(g, x)
        
        # Get source and destination nodes for each edge
        src, dst = g.edges()  # Get indices of source and destination nodes
        src_embeddings = x[src]  # Embeddings for source nodes
        dst_embeddings = x[dst]  # Embeddings for destination nodes
        
        # Concatenate source and destination node embeddings to create edge features
        edge_features = torch.cat([src_embeddings, dst_embeddings], dim=1)  # Concatenate along the feature dimension
        logits = self.fc(edge_features).squeeze()  # * Output a single value per edge (fraud score) *
        return logits


In [13]:
# Count occurrences of each label in the 'isFraud' column
label_counts = df['isFraud'].value_counts()

# Calculate the proportion of each class
fraud_ratio = label_counts[1] / len(df)  # Assuming '1' represents fraud
non_fraud_ratio = label_counts[0] / len(df)  # Assuming '0' represents non-fraud

# Print the results
print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")

Fraud count: 6544
Non-fraud count: 49935
Fraud ratio: 0.1159
Non-fraud ratio: 0.8841


In [14]:
from dgl.dataloading import NeighborSampler, DataLoader

# Define batch size
batch_size = 512  

# Define a NeighborSampler (sampling 10 neighbors per layer)
sampler = NeighborSampler([10, 10])  # 2-layer GraphSAGE with 10 neighbors per node

g.ndata['train_mask']=train_mask  # Boolean mask for training nodes
train_nid = torch.nonzero(train_mask, as_tuple=True)[0]  # Get indices

# Create a DataLoader for mini-batching
train_dataloader = DataLoader(
    g,                # Full DGL graph
    train_nid,        # Training node IDs (subset of g.ndata['train_mask'])
    sampler,          # Neighbor sampler
    batch_size=batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=4
)


In [15]:


def compose_graph_from_blocks(blocks):
    # Extract the nodes and edges from the blocks
    block1, block2 = blocks

    # Get source and destination nodes for both blocks
    src_nodes_block1 = block1.srcdata['_ID']
    dst_nodes_block1 = block1.dstdata['_ID']
    
    src_nodes_block2 = block2.srcdata['_ID']
    dst_nodes_block2 = block2.dstdata['_ID']

    # Get the edges (src, dst) for both blocks
    src_edges_block1, dst_edges_block1 = block1.edges()
    src_edges_block2, dst_edges_block2 = block2.edges()

    # Create a new graph combining the nodes and edges from both blocks
    # Add all the edges and nodes from both blocks
    edges_src_all = torch.cat([src_edges_block1, src_edges_block2])
    edges_dst_all = torch.cat([dst_edges_block1, dst_edges_block2])

    # Create a new graph from these nodes and edges
    g = dgl.graph((edges_src_all, edges_dst_all))

    # Ensure that the node IDs are consistent across the blocks
    # Create a mapping from block-local node IDs to global node IDs
    node_id_mapping = {**dict(zip(src_nodes.tolist(), range(len(src_nodes)))), 
                       **dict(zip(dst_nodes.tolist(), range(len(dst_nodes))))}

    # Concatenate node features from both blocks, ensuring correct feature assignment
    src_features = block1.srcdata['features']
    dst_features = block2.dstdata['features']

    # The final number of features should match the total number of nodes
    all_features = torch.cat([src_features, dst_features], dim=0)
        # Print out some debugging information to understand the shapes
    print(f"Shape of src_features: {src_features.shape}")
    print(f"Shape of dst_features: {dst_features.shape}")
    print(f"Shape of all_features: {all_features.shape}")
    print(f"Number of nodes in composed graph: {g.num_nodes()}")

    # Ensure the number of features matches the number of nodes in the graph
    assert len(all_features) == g.num_nodes(), "Mismatch between features and nodes."

    # Set the node features for the composed graph
    g.ndata['features'] = all_features

    return g




In [31]:
# Instantiate the model
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
in_feats = 2  # balanceOrig and balanceDest features
hidden_feats = 64
out_feats = 1  # Fraud (binary classification)

pos_weight = torch.tensor([non_fraud_ratio/fraud_ratio],dtype=torch.float)
#pos_weight = torch.tensor([1.0],dtype=torch.float)


model = GraphSAGE(in_feats, hidden_feats, out_feats)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Train the model
epochs = 50
for epoch in range(epochs):
    model.train()
    for step, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
        # Forward pass
        #print(len(blocks))
        #composed_graph = compose_graph_from_blocks(blocks)
        input_features=g.ndata['features'][input_nodes]
        input_nodes_batch = torch.tensor(input_nodes)
        subgraph = dgl.node_subgraph(g, input_nodes_batch)
        logits = model(subgraph, input_features)
        #print(logits.shape)
    
        # Get target labels for the fraud detection task
        labels = g.ndata['isFraud'][input_nodes]
        breakpoint()
        # Compute loss (use train_mask to filter out test edges)
        #print("labels",labels[train_mask].view(-1,1).shape)
        #print("logits", logits[train_mask].view(-1,1).shape)
        loss = loss_fn(logits, labels)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        # Print loss every few epochs
    if (epoch + 1) % 2 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")



  input_nodes_batch = torch.tensor(input_nodes)


Epoch 2/50, Loss: 53319.703125
Epoch 4/50, Loss: 18826.716796875
Epoch 6/50, Loss: 5770.92041015625
Epoch 8/50, Loss: 375.16619873046875
Epoch 10/50, Loss: 15.654949188232422
Epoch 12/50, Loss: 3.575941324234009
Epoch 14/50, Loss: 9.680578231811523
Epoch 16/50, Loss: 14.071416854858398
Epoch 18/50, Loss: 18.81200408935547
Epoch 20/50, Loss: 14.072147369384766
Epoch 22/50, Loss: 4.734349727630615
Epoch 24/50, Loss: 1.4067397117614746
Epoch 26/50, Loss: 4.717514991760254
Epoch 28/50, Loss: 4.274027347564697
Epoch 30/50, Loss: 2.1701550483703613
Epoch 32/50, Loss: 9.987435340881348
Epoch 34/50, Loss: 6.464606285095215
Epoch 36/50, Loss: 6.07663631439209
Epoch 38/50, Loss: 5.622628211975098
Epoch 40/50, Loss: 5.088200569152832
Epoch 42/50, Loss: 1.4591577053070068
Epoch 44/50, Loss: 9.314507484436035
Epoch 46/50, Loss: 8.872360229492188
Epoch 48/50, Loss: 6.083202362060547
Epoch 50/50, Loss: 11.18979549407959


Epoch 2/20, Loss: 50026236.0
Epoch 4/20, Loss: 21910472.0
Epoch 6/20, Loss: 12671362.0
Epoch 8/20, Loss: 12492275.0
Epoch 10/20, Loss: 13373789.0
Epoch 12/20, Loss: 14267830.0
Epoch 14/20, Loss: 14137046.0
Epoch 16/20, Loss: 12949893.0
Epoch 18/20, Loss: 11810090.0
Epoch 20/20, Loss: 10091464.0
0.1
Accuracy: 0.7332
Recall: 0.0086
F1 Score: 0.0073
0.01
Accuracy: 0.7535
Recall: 0.0039
F1 Score: 0.0036
0.2
Accuracy: 0.7378
Recall: 0.0102
F1 Score: 0.0087
0.4
Accuracy: 0.7405
Recall: 0.0079
F1 Score: 0.0068

In [24]:
g.ndata.keys()

dict_keys(['features', 'isFraud', 'train_mask', 'test_mask'])

In [21]:


# Evaluate the model
model.eval()
with torch.no_grad():
    logits = model(g, g.ndata['features'])

    # Convert logits to probabilities
    predictions = torch.sigmoid(logits).squeeze()
    
    # Apply threshold of 0.5 to classify fraud
    predicted_labels = (predictions > 0.5).float()
    predicted_labels = predicted_labels[test_mask]  # Apply test_mask here
    # Get actual labels
    #true_labels = g.edata['isFraud']
    true_labels = g.ndata['isFraud'][test_mask]  # Apply test_mask here
    # Compute accuracy manually
    correct = (predicted_labels == true_labels).sum().item()
    total = true_labels.size(0)
    accuracy = correct / total

    # Print evaluation results
    print(f"Accuracy: {accuracy:.4f}")
    # Calculate F1-Score
    f1 = f1_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    print(f"Recall: {recall:.4f}")
    print(f"Precision:{precision:.4f}")
    print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6978
Recall: 0.6397
Precision:0.2181
F1 Score: 0.3253


In [19]:
g.ndata['features']
predicted_labels

tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [20]:
sum(labels==1)

tensor(81)

Accuracy: 0.2478
Recall: 0.9758
Precision:0.1310
F1 Score: 0.2309

Accuracy: 0.5616
Recall: 0.8884
Precision:0.1945
F1 Score: 0.3192