PaySim experiment with a simple feedforward network.

In [1]:
import pandas as pd
import torch
device = torch.device('cpu')
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [2]:
df=pd.read_csv("../data/PaySim_kaggle.csv")

df=df.sample(n=5000000)
#df=df.sample(n=500000)


print(df.head())

         step      type     amount     nameOrig  oldbalanceOrg  \
3355445   254   PAYMENT   11171.09   C397512263            0.0   
3139025   236  TRANSFER  141329.32  C1851763803        21569.0   
1031106    93   PAYMENT   17938.48   C966657919            0.0   
1953827   178   PAYMENT   14321.01   C571973406       126134.0   
3410699   255  TRANSFER  340993.53  C1832604853          111.0   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
3355445            0.00   M982762771            0.00            0.00        0   
3139025            0.00   C209771641      2680514.85      2821844.16        0   
1031106            0.00    M25929709            0.00            0.00        0   
1953827       111812.99  M1218748937            0.00            0.00        0   
3410699            0.00  C1881267262     14609660.47     14950654.00        0   

         isFlaggedFraud  
3355445               0  
3139025               0  
1031106               0  
1953827     

Adjust data proportions as the fraud data is heavily biased having less fraudulant cases.

In [3]:
# Set the proportion of non-fraud to remove (e.g., 50% of non-fraud instances)
remove_fraction = 0.9985

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly sample and remove 'remove_fraction' proportion of non-fraud instances
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance
label_counts = df['isFraud'].value_counts()
fraud_ratio = label_counts[1] / len(df)
non_fraud_ratio = label_counts[0] / len(df)

print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")
print(df.shape)


Fraud count: 6503
Non-fraud count: 7490
Fraud ratio: 0.4647
Non-fraud ratio: 0.5353
(13993, 11)


In [4]:
# Create a mapping from unique user names to numeric IDs (nodes)
from sklearn.preprocessing import LabelEncoder
orig_encoder = LabelEncoder()
dest_encoder = LabelEncoder()

df['nameOrig_encoded'] = orig_encoder.fit_transform(df['nameOrig'])
df['nameDest_encoded'] = dest_encoder.fit_transform(df['nameDest'])

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,nameOrig_encoded,nameDest_encoded
1487409,142,PAYMENT,9800.62,C708161683,64372.1,54571.48,M2096316217,0.0,0.0,0,0,11886,12788
6296098,672,TRANSFER,21332.46,C899485739,21332.46,0.0,C1469356710,0.0,0.0,1,0,13254,2744
4272781,307,CASH_OUT,31384.8,C1779520101,0.0,0.0,C611675688,349265.88,380650.68,0,0,5645,9075
6281461,645,CASH_OUT,1576874.04,C1029506944,1576874.04,0.0,C1873712086,0.0,1576874.04,1,0,232,5108
3724038,278,CASH_IN,98207.3,C983981704,105270.0,203477.3,C872301281,4719.97,0.0,0,0,13853,10640


In [6]:
# Convert to tensors
# Get the number of unique identifiers
num_orig = df["nameOrig_encoded"].nunique()
num_dest = df["nameDest_encoded"].nunique()

from sklearn.model_selection import train_test_split

# Convert to PyTorch tensors
X_orig = torch.tensor(df["nameOrig_encoded"].values, dtype=torch.long)
X_dest = torch.tensor(df["nameDest_encoded"].values, dtype=torch.long)
y = torch.tensor(df["isFraud"].values, dtype=torch.float32).unsqueeze(1)

# Split dataset
X_orig_train, X_orig_test, X_dest_train, X_dest_test, y_train, y_test = train_test_split(
    X_orig, X_dest, y, test_size=0.2, random_state=42
)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_orig_train, X_dest_train, y_train)
test_dataset = TensorDataset(X_orig_test, X_dest_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [12]:
class FraudModel(nn.Module):
    def __init__(self, num_orig, num_dest, embedding_dim=8, num_hidden=16):
        super(FraudModel, self).__init__()

        # Learnable embeddings (no autoencoder needed)
        self.orig_embedding = nn.Embedding(num_orig, embedding_dim)
        self.dest_embedding = nn.Embedding(num_dest, embedding_dim)

        # Fully connected network
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, 1),
        )

    def forward(self, orig, dest):
        orig_emb = self.orig_embedding(orig)
        dest_emb = self.dest_embedding(dest)
        x = torch.cat((orig_emb, dest_emb), dim=1)
        return self.fc(x)





In [22]:
def train_model(model, train_loader, optimizer, epochs=50):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for orig, dest, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(orig, dest)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Initialize and train model
model = FraudModel(num_orig, num_dest)
optimizer = optim.Adam(model.parameters(), lr=0.001)
pos_weight = torch.tensor([non_fraud_ratio/fraud_ratio],dtype=torch.float)
pos_weight = torch.tensor([1.0],dtype=torch.float)
#loss_fn = nn.BCELoss()  # Binary Cross Entropy
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

train_model(model, train_loader, optimizer)




Epoch 1, Loss: 121.4478
Epoch 2, Loss: 120.5092
Epoch 3, Loss: 119.8700
Epoch 4, Loss: 118.5518
Epoch 5, Loss: 116.1647
Epoch 6, Loss: 112.5776
Epoch 7, Loss: 107.7992
Epoch 8, Loss: 101.9350
Epoch 9, Loss: 95.3070
Epoch 10, Loss: 87.6422
Epoch 11, Loss: 79.6660
Epoch 12, Loss: 71.5942
Epoch 13, Loss: 63.6942
Epoch 14, Loss: 56.1496
Epoch 15, Loss: 48.9134
Epoch 16, Loss: 42.4077
Epoch 17, Loss: 36.6277
Epoch 18, Loss: 31.4300
Epoch 19, Loss: 26.8996
Epoch 20, Loss: 22.9100
Epoch 21, Loss: 19.3975
Epoch 22, Loss: 16.3733
Epoch 23, Loss: 13.8242
Epoch 24, Loss: 11.4998
Epoch 25, Loss: 9.6551
Epoch 26, Loss: 8.0002
Epoch 27, Loss: 6.6120
Epoch 28, Loss: 5.4670
Epoch 29, Loss: 4.5380
Epoch 30, Loss: 3.7343
Epoch 31, Loss: 3.1020
Epoch 32, Loss: 2.5590
Epoch 33, Loss: 2.1296
Epoch 34, Loss: 1.7527
Epoch 35, Loss: 1.4702
Epoch 36, Loss: 1.2124
Epoch 37, Loss: 0.9849
Epoch 38, Loss: 0.7938
Epoch 39, Loss: 0.6398
Epoch 40, Loss: 0.5109
Epoch 41, Loss: 0.3809
Epoch 42, Loss: 0.2790
Epoch 43, L

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for orig, dest, labels in test_loader:
            outputs = model(orig, dest)
            preds = (outputs > 0.5).float()  # Convert probabilities to 0/1
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Test Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Run evaluation
evaluate_model(model, test_loader)



Test Accuracy: 0.4945
Precision: 0.4657
Recall: 0.6008
F1-score: 0.5247


Test Accuracy: 0.5641
Precision: 0.1161
Recall: 0.4367
F1-score: 0.1835