In [19]:
# This is a sample Python script.
import pandas as pd
import torch
device = torch.device('cpu')
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [20]:
df=pd.read_csv("../data/PaySim_kaggle.csv")

df=df.sample(n=5000000)
#df=df.sample(n=500000)


print(df.head())

         step      type     amount     nameOrig  oldbalanceOrg  \
2415348   202   PAYMENT    6453.68  C1466128890           0.00   
1325883   137  CASH_OUT     472.66   C230571364           0.00   
2817334   225   CASH_IN  106653.96  C1044101442       52089.00   
3259836   251  CASH_OUT  312174.49  C1153970440          53.00   
91773      10   CASH_IN  241350.40    C91041415     3657362.32   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
2415348            0.00  M1413211184            0.00            0.00        0   
1325883            0.00   C933244703       270258.35       270731.01        0   
2817334       158742.96   C738674915       482029.01       375375.05        0   
3259836            0.00   C765402023        13935.75       326110.24        0   
91773        3898712.71   C187569650       983297.77       282071.98        0   

         isFlaggedFraud  
2415348               0  
1325883               0  
2817334               0  
3259836     

Adjust data proportions as the fraud data is heavily biased having less fraudulant cases.

In [21]:
# Set the proportion of non-fraud to remove (e.g., 50% of non-fraud instances)
remove_fraction = 0.9985

# Separate the fraud and non-fraud instances
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Randomly sample and remove 'remove_fraction' proportion of non-fraud instances
non_fraud_to_remove = non_fraud_df.sample(frac=remove_fraction, random_state=42)

# Drop the sampled non-fraud instances from the DataFrame
df = df.drop(non_fraud_to_remove.index)

# Verify the new balance
label_counts = df['isFraud'].value_counts()
fraud_ratio = label_counts[1] / len(df)
non_fraud_ratio = label_counts[0] / len(df)

print(f"Fraud count: {label_counts[1]}")
print(f"Non-fraud count: {label_counts[0]}")
print(f"Fraud ratio: {fraud_ratio:.4f}")
print(f"Non-fraud ratio: {non_fraud_ratio:.4f}")
print(df.shape)


Fraud count: 6479
Non-fraud count: 7490
Fraud ratio: 0.4638
Non-fraud ratio: 0.5362
(13969, 11)


In [22]:
# Create a mapping from unique user names to numeric IDs (nodes)
from sklearn.preprocessing import LabelEncoder
orig_encoder = LabelEncoder()
dest_encoder = LabelEncoder()

df['nameOrig_encoded'] = orig_encoder.fit_transform(df['nameOrig'])
df['nameDest_encoded'] = dest_encoder.fit_transform(df['nameDest'])

In [23]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,nameOrig_encoded,nameDest_encoded
6129789,542,TRANSFER,288698.39,C895157771,69.0,0.0,C2033770857,149150.99,437849.38,0,0,13171,5969
3438226,256,PAYMENT,24044.85,C1714327915,20499.0,0.0,M79084875,0.0,0.0,0,0,5205,13574
5987928,411,TRANSFER,365069.45,C337746955,365069.45,0.0,C1144989996,0.0,0.0,1,0,9185,828
1030438,65,CASH_OUT,446032.95,C289101217,446032.95,0.0,C514505298,205977.41,652010.36,1,0,8852,8440
2750119,212,PAYMENT,43973.7,C370824790,0.0,0.0,M620922867,0.0,0.0,0,0,9427,13353


In [24]:
# Convert to tensors
# Get the number of unique identifiers
num_orig = df["nameOrig_encoded"].nunique()
num_dest = df["nameDest_encoded"].nunique()

from sklearn.model_selection import train_test_split

# Convert to PyTorch tensors
X_orig = torch.tensor(df["nameOrig_encoded"].values, dtype=torch.long)
print(df["nameOrig_encoded"].values.shape)
X_dest = torch.tensor(df["nameDest_encoded"].values, dtype=torch.long)
y = torch.tensor(df["isFraud"].values, dtype=torch.float32).unsqueeze(1)

# Split dataset
X_orig_train, X_orig_test, X_dest_train, X_dest_test, y_train, y_test = train_test_split(
    X_orig, X_dest, y, test_size=0.2, random_state=42
)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_orig_train, X_dest_train, y_train)
test_dataset = TensorDataset(X_orig_test, X_dest_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



(13969,)


In [25]:
class FraudModel(nn.Module):
    def __init__(self, num_orig, num_dest, embedding_dim=8, num_hidden=16):
        super(FraudModel, self).__init__()

        # Learnable embeddings (no autoencoder needed)
        self.orig_embedding = nn.Embedding(num_orig, embedding_dim)
        self.dest_embedding = nn.Embedding(num_dest, embedding_dim)

        # Fully connected network
        self.conv = nn.Sequential(
            #nn.Linear(embedding_dim * 2, 16),
            nn.Conv2d(in_channels=1, out_channels=num_hidden, kernel_size=(3,1), padding=(1,0)),
            nn.ReLU(),
            nn.Conv2d(in_channels=num_hidden, out_channels=num_hidden, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU()
        )
        
        self.fc=nn.Sequential(
            nn.Flatten(),
            nn.Linear(num_hidden*embedding_dim*2, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, 1),
        )

    def forward(self, orig, dest):
        orig_emb = self.orig_embedding(orig)
        #print("in forward", orig_emb.shape)
        dest_emb = self.dest_embedding(dest)
        #print("in forward", dest_emb.shape)
        #testi=(orig_emb, dest_emb)
        x=torch.stack((orig_emb, dest_emb), dim=1).unsqueeze(1)
        #print("in forward", x.shape)
        x=self.conv(x)
        x=self.fc(x)
        
        return x


In [28]:
def train_model(model, train_loader, optimizer, epochs=30):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for orig, dest, labels in train_loader:
            optimizer.zero_grad()
            #print(orig.shape)
            #print(dest.shape)
            #print(orig)
            outputs = model(orig, dest)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Initialize and train model
model = FraudModel(num_orig, num_dest)
optimizer = optim.Adam(model.parameters(), lr=0.001)
#criterion = nn.BCELoss()  # Binary Cross Entropy
pos_weight = torch.tensor([non_fraud_ratio/fraud_ratio],dtype=torch.float)
#criterion = nn.BCELoss(pos_weight=pos_weight)  # Binary Cross Entropy
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
train_model(model, train_loader, optimizer)




Epoch 1, Loss: 130.0188
Epoch 2, Loss: 129.8397
Epoch 3, Loss: 129.0718
Epoch 4, Loss: 127.0122
Epoch 5, Loss: 123.5169
Epoch 6, Loss: 118.4008
Epoch 7, Loss: 112.2738
Epoch 8, Loss: 104.1805
Epoch 9, Loss: 95.1301
Epoch 10, Loss: 85.1575
Epoch 11, Loss: 74.9688
Epoch 12, Loss: 64.7468
Epoch 13, Loss: 54.4259
Epoch 14, Loss: 45.4685
Epoch 15, Loss: 37.0497
Epoch 16, Loss: 29.4786
Epoch 17, Loss: 22.9253
Epoch 18, Loss: 17.4330
Epoch 19, Loss: 12.8970
Epoch 20, Loss: 9.4435
Epoch 21, Loss: 6.8559
Epoch 22, Loss: 4.9054
Epoch 23, Loss: 3.4359
Epoch 24, Loss: 2.4860
Epoch 25, Loss: 1.6964
Epoch 26, Loss: 1.2520
Epoch 27, Loss: 0.8290
Epoch 28, Loss: 0.6113
Epoch 29, Loss: 0.4411
Epoch 30, Loss: 0.3382


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for orig, dest, labels in test_loader:
            outputs = model(orig, dest)
            preds = (outputs > 0.5).float()  # Convert probabilities to 0/1
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Test Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Run evaluation
evaluate_model(model, test_loader)



Test Accuracy: 0.4893
Precision: 0.4651
Recall: 0.5352
F1-score: 0.4977


Epoch 10 Test Accuracy: 0.6458
Precision: 0.1172
Recall: 0.3113
F1-score: 0.1703

Test Accuracy: 0.4002
Precision: 0.1214
Recall: 0.6636
F1-score: 0.2053