In [50]:
import pandas as pd
import torch
device = torch.device('cpu')
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [51]:
df=pd.read_csv("../data/PaySim_kaggle.csv")

df=df.sample(n=5000000)
#df=df.sample(n=500000)


print(df.head())

         step      type     amount     nameOrig  oldbalanceOrg  \
3737337   278   CASH_IN  330546.72  C1434018178         1555.0   
1885586   164   PAYMENT    3947.97  C1311563057         8630.0   
889947     42  CASH_OUT  428874.96  C2057004509        69109.0   
505132     20  CASH_OUT   66125.64   C206372042            0.0   
3983729   298   PAYMENT   11141.33   C581455749            0.0   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
3737337       332101.72   C591420716            0.00            0.00        0   
1885586         4682.03  M1120433087            0.00            0.00        0   
889947             0.00  C1314425213         8002.00       433943.13        0   
505132             0.00   C137318700      1284044.89      1350170.53        0   
3983729            0.00  M1528457402            0.00            0.00        0   

         isFlaggedFraud  
3737337               0  
1885586               0  
889947                0  
505132      

In [52]:
# Set the proportion of non-fraud to remove (e.g., 50% of non-fraud instances)
remove_fraction = 0.99

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly sample and remove 'remove_fraction' proportion of non-fraud instances
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance
label_counts = df['isFraud'].value_counts()
fraud_ratio = label_counts[1] / len(df)
non_fraud_ratio = label_counts[0] / len(df)

print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")
print(df.shape)


Fraud count: 6468
Non-fraud count: 49935
Fraud ratio: 0.1147
Non-fraud ratio: 0.8853
(56403, 11)


In [53]:
# Create a mapping from unique user names to numeric IDs (nodes)
from sklearn.preprocessing import LabelEncoder
orig_encoder = LabelEncoder()
dest_encoder = LabelEncoder()

df['nameOrig_encoded'] = orig_encoder.fit_transform(df['nameOrig'])
df['nameDest_encoded'] = dest_encoder.fit_transform(df['nameDest'])

In [54]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,nameOrig_encoded,nameDest_encoded
2953142,230,CASH_OUT,299108.59,C439670549,52037.0,0.0,C1533996817,6073059.91,6372168.5,0,0,40042,10094
5569001,393,CASH_IN,311043.55,C239356840,993503.04,1304546.59,C1587837852,6928751.77,6617708.22,0,0,34336,11120
1632456,157,CASH_IN,126877.34,C46649736,246628.94,373506.28,C771397487,226021.1,99143.77,0,0,40855,32737
3597811,263,CASH_OUT,94757.45,C1051247968,93648.76,0.0,C303666365,431077.78,525835.23,0,0,1516,23684
5494766,380,CASH_OUT,68414.15,C482468166,0.0,0.0,C1708721681,2068163.83,2136577.98,0,0,41338,13478


In [55]:
# Convert to tensors
# Get the number of unique identifiers
num_orig = df["nameOrig_encoded"].nunique()
num_dest = df["nameDest_encoded"].nunique()

from sklearn.model_selection import train_test_split

# Convert to PyTorch tensors
X_orig = torch.tensor(df["nameOrig_encoded"].values, dtype=torch.long)
X_dest = torch.tensor(df["nameDest_encoded"].values, dtype=torch.long)
y = torch.tensor(df["isFraud"].values, dtype=torch.float32).unsqueeze(1)

# Split dataset
X_orig_train, X_orig_test, X_dest_train, X_dest_test, y_train, y_test = train_test_split(
    X_orig, X_dest, y, test_size=0.2, random_state=42
)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_orig_train, X_dest_train, y_train)
test_dataset = TensorDataset(X_orig_test, X_dest_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [56]:
class FraudModel(nn.Module):
    def __init__(self, num_orig, num_dest, embedding_dim=8, num_hidden=16):
        super(FraudModel, self).__init__()

        # Learnable embeddings (no autoencoder needed)
        self.orig_embedding = nn.Embedding(num_orig, embedding_dim)
        self.dest_embedding = nn.Embedding(num_dest, embedding_dim)

        # Fully connected network
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, 1),
            nn.Sigmoid()
        )

    def forward(self, orig, dest):
        orig_emb = self.orig_embedding(orig)
        dest_emb = self.dest_embedding(dest)
        x = torch.cat((orig_emb, dest_emb), dim=1)
        return self.fc(x)





In [66]:
def train_model(model, train_loader, optimizer, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for orig, dest, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(orig, dest)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Initialize and train model
model = FraudModel(num_orig, num_dest)
optimizer = optim.Adam(model.parameters(), lr=0.001)
pos_weight = torch.tensor([non_fraud_ratio/fraud_ratio],dtype=torch.float)
#pos_weight = torch.tensor([1.0],dtype=torch.float)
#criterion = nn.BCELoss(pos_weight=pos_weight)  # Binary Cross Entropy
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

train_model(model, train_loader, optimizer)




Epoch 1, Loss: 873.0770
Epoch 2, Loss: 865.8788
Epoch 3, Loss: 864.5652
Epoch 4, Loss: 852.5736
Epoch 5, Loss: 833.9004
Epoch 6, Loss: 812.2285
Epoch 7, Loss: 789.2254
Epoch 8, Loss: 768.8357
Epoch 9, Loss: 749.6346
Epoch 10, Loss: 732.1172
Epoch 11, Loss: 717.1047
Epoch 12, Loss: 706.1206
Epoch 13, Loss: 694.3984
Epoch 14, Loss: 686.7602
Epoch 15, Loss: 680.8507
Epoch 16, Loss: 676.2119
Epoch 17, Loss: 672.7436
Epoch 18, Loss: 670.2161
Epoch 19, Loss: 668.1208
Epoch 20, Loss: 666.9742


In [67]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for orig, dest, labels in test_loader:
            outputs = model(orig, dest)
            preds = (outputs > 0.5).float()  # Convert probabilities to 0/1
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Test Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Run evaluation
evaluate_model(model, test_loader)



Test Accuracy: 0.5749
Precision: 0.1106
Recall: 0.3889
F1-score: 0.1722


Test Accuracy: 0.5641
Precision: 0.1161
Recall: 0.4367
F1-score: 0.1835